Dear team,
Kindly help me use the parallel command to run my code below:
* ------------------------------------------------------------------
* Set-up: load the aggregated data, harmonise the vaccine fields and
* create the covariates and result holders used by the imputation steps.
* ------------------------------------------------------------------
cd "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW"

* Load the data (the original note describes this as "reading HIV data").
* Earlier input file, kept for reference:
*use data/SEXDATA_AGREGATED_zerocasesIncluded,clear
use data/SEXDATA_AGREGATED_zerocasesIncluded2024BEST, clear

* ---- Superseded approach (all commented out, kept for reference) ----
* We agreed with Kate to replace the mid-year population by half in sex
*gen midyrpopn_ipd5=(midyrpopn_ipd*0.5)
*drop midyrpopn_ipd
*ren midyrpopn_ipd5 midyrpopn_ipd
* Generating CCR and its standard error (imputing disease=1 if disease==0)
*replace count_ipd=1 if count_ipd==0
*gen incidence=(count_ipd/midyrpopn_ipd)*100000
* Where carriage is zero, replace prevalence by a small constant (Christian's
* advice). The original note says 0.05 but the commented code uses 0.005.
*gen prevalence=count_carr/popnsampled_carr
*replace prevalence=0.005 if count_carr==0
*gen CCR=incidence/prevalence
* log scale
*gen lnCCR=log(CCR)
*gen SElnCCR=sqrt((1/count_ipd)+((1-prevalence)/(prevalence*popnsampled_carr)))
* serotype label harmonisation
*replace serotype="18A/18B/18C/18F" if serotype=="18C/18F/18B/18A"
*replace serotype="6C/6D" if serotype=="6C/D"
*replace serotype="6A/6B" if serotype=="6A/B"
*replace serotype="NT" if serotype=="NON-TYPABLE"

* ---- Current approach: zero disease / zero carriage are imputed with a
* ---- Bayesian approach in the steps that follow this set-up.
*browse if year_sampling_carr=="2009" & (country=="The Gambia" | country=="Gambia")

* Vaccine-status corrections for specific country/period records
replace PCV = "No PCV" if year == 2009 & (Country == "GAMBIA")
replace pre_post_vax = "pre" if year == 2009 & (Country == "GAMBIA")
replace PCV = "PCV10" if pre_post_vax == "post" & Country == "Mozambique"
*replace yr_pcv_intro=2009 if year_sampling_carr=="2009" & (country=="The Gambia" | country=="Gambia")
*replace country="The Gambia" if country=="Gambia"
*PCV13 was introduced 2011
*replace yr_pcv_intro=2011 if country=="The Gambia" & pcv=="PCV13"
*browse if country=="The Gambia"

* Covariates: pcv7 pcv10 pcv13 lmic umic hic age5_14 age15more ageallages

* PCV product in use: normalise to trimmed lower case, inspecting before and after
codebook PCV
replace PCV = trim(lower(PCV))
codebook PCV
foreach product in pcv7 pcv10 pcv13 {
    generate `product' = (PCV == "`product'")
}

* Income group indicators (variable name is the lower-case group code)
foreach grp in LMIC UMIC HIC {
    local dummy = lower("`grp'")
    generate `dummy' = (Income_grp == "`grp'")
}

* Age-group indicators taken from age_ipd
generate age5_14 = (age_ipd == "5-14years")
generate age15more = (age_ipd == "15years & Above")
generate ageallages = (age_ipd == "all_ages")

* Post-vaccination period indicator
generate postpcv = (pre_post_vax == "post")

* Observed incidence per 100,000 and observed carriage prevalence
generate incidence = (count_ipd/midyrpopn_ipd)*100000
generate prevalence = count_carr/popnsampled_carr

* Result holders, filled in by the imputation steps.
* NOTE(review): sig is a fixed constant; presumably the dispersion value used
* in the empirical-Bayes formulas later on -- confirm where it comes from.
generate est_eb = .
generate sig = .1147768
foreach holder in imputed_preva imputed_inci no_coverge_preva no_coverge_inc {
    generate `holder' = .
}
**
*levelsof agr, local(levels)
*foreach i of local levels
** Predicting carriage prevalence, one serotype at a time.
**
** The earlier version looped over -levelsof serotype- and fitted each model
** with -if serotype=="..."-, repeating the same fit/predict/replace code three
** times and rescanning the whole data set on every command (about 24 hours).
** It is rewritten below in the same -runby- form used for the disease step:
** the program only ever sees the observations of one serotype.
**
** Things to be aware of:
**  - -runby- keeps whatever is in memory when the program ends. If the program
**    exits with an error for a by-group, that group's observations are not
**    returned, so check the error count that -runby- reports.
**  - -levelsof- skipped observations with an empty serotype; -runby- also
**    processes them as a by-group of their own.
**  - Output from inside the program is hidden unless -verbose- is added.
**  - NOTE(review): serotypes are fitted independently of each other, so this
**    call is the natural place to try -parallel- (SSC) with by(serotype);
**    confirm the exact syntax in -help parallel- before relying on it.
capture program drop one_serotype_carr
program define one_serotype_carr
    * Reads : count_carr popnsampled_carr sig and the covariates listed below
    * Writes: est_eb and imputed_preva (when a model fits), or
    *         no_coverge_preva (when none of the models can be fitted)
    *
    * Candidate covariate sets, tried in this order until one fits without error
    local spec1 "pcv7 pcv10 pcv13 lmic umic hic age5_14 age15more ageallages"
    local spec2 "postpcv hic age5_14 age15more ageallages"
    local spec3 ""

    display serotype[1]
    local fitted 0
    forvalues m = 1/3 {
        capture betabin count_carr `spec`m'', n(popnsampled_carr) link(cloglog)
        if _rc == 0 {
            local fitted 1
            continue, break
        }
        if `m' == 1 {
            display "model1 did not converge"
        }
    }

    if `fitted' {
        * Combine the model prediction with the observed count
        predict mu
        replace imputed_preva = 1
        replace est_eb = (mu/sig + count_carr)/(mu/sig + (1-mu)/sig + popnsampled_carr)
        drop mu
    }
    else {
        replace no_coverge_preva = 1
    }
end
runby one_serotype_carr, by(serotype) status
*** (save the data after carriage prevalence imputation not within the code) save "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW/data/SEXDATAMPREV_IMPUTED.dta",replace
* ---- Carriage: combine observed and imputed prevalence ----
sort serotype
label variable est_eb "Bayes probability carriage"

* Use the Bayes estimate only where the observed prevalence is exactly zero
* and an estimate is available; otherwise keep the observed prevalence.
generate prop_carr_imp = cond(prevalence == 0 & !missing(est_eb), est_eb, prevalence)
generate count_carr_imp = prop_carr_imp*popnsampled_carr

* Visual checks
sort serotype
browse prevalence prop_carr_imp Country count_carr_imp Country est_eb
browse prevalence prop_carr_imp Country count_carr_imp count_carr est_eb

rename prevalence prop_carr
sort serotype

* Which serotypes had carriage imputed?
tabulate serotype if imputed_preva == 1, missing
order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp
browse if imputed_preva == 1

* Data are saved outside the do-file at this point (after carriage imputation):
* save "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW/data/SEXDATAMPREV_IMPUTED.dta",replace
* The carriage-imputed sex data are used to manually select serotypes for analysis.

* ---- Disease: holder for the imputed IPD estimate ----
generate est_eb_d = .
* Clyde's code is used for imputing disease (it is faster than the loop
* version; adapt it later for the carriage imputation).
capture program drop one_serotype
program define one_serotype
    * Impute IPD incidence for the single serotype currently in memory
    * (intended to be called once per by-group through -runby-).
    *
    * Reads : count_ipd midyrpopn_ipd postpcv hic age15more sig
    * Writes: est_eb_d and imputed_inci (when a model fits), or
    *         no_coverge_inc (when none of the models can be fitted)
    *
    * Models are tried in order until one runs without error:
    *   1. postpcv hic age15more
    *   2. intercept only
    * The former middle model (postpcv hic) had been switched off by inserting
    * a non-existent variable name (betain) so that it always errored and fell
    * through to the intercept-only model. It is simply left out of the list
    * here, which gives the same results without relying on a forced error.
    * To bring it back: set spec2 to "postpcv hic", add an empty spec3 and
    * loop over 1/3.
    local spec1 "postpcv hic age15more"
    local spec2 ""

    display serotype[1]
    local fitted 0
    forvalues m = 1/2 {
        capture betabin count_ipd `spec`m'', n(midyrpopn_ipd) link(cloglog)
        if _rc == 0 {
            local fitted 1
            continue, break
        }
        if `m' == 1 {
            display "model1 did not converge"
        }
    }

    if `fitted' {
        * Combine the model prediction with the observed count
        predict mu
        replace imputed_inci = 1
        replace est_eb_d = (mu/sig + count_ipd)/(mu/sig + (1-mu)/sig + midyrpopn_ipd)
        drop mu
    }
    else {
        replace no_coverge_inc = 1
    }
end
* Fit the disease model separately for every serotype
runby one_serotype, by(serotype) status

* Bring the key columns to the front and inspect the imputed rows
order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp est_eb_d incidence count_ipd imputed_inci no_coverge_inc
browse if imputed_inci == 1

* ---- Variables derived after disease imputation ----
* Multiply the estimate by 100,000 to express it as an incidence per 100,000
replace est_eb_d = (est_eb_d*100000)
rename incidence incidence_ipd

* Use the Bayes estimate only where the observed incidence is exactly zero
* and an estimate is available; otherwise keep the observed incidence.
generate incidence_ipd_imp = cond(incidence_ipd == 0 & !missing(est_eb_d), est_eb_d, incidence_ipd)
label variable incidence_ipd_imp "imputed IPD replac 0 with est_eb"
generate count_ipd_imp = (incidence_ipd_imp/100000)*midyrpopn_ipd

* Case-carrier ratio from the observed data, its natural log and standard error
generate CCR = incidence_ipd/prop_carr
generate lnCCR = ln(CCR)
generate SElnCCR = sqrt((1/count_ipd)+((1-prop_carr)/(prop_carr*popnsampled_carr)))

* Case-carrier ratio from the imputed data
generate CCR_imp = incidence_ipd_imp/prop_carr_imp
*codebook CCR_imp if include==1
* natural log, variance, standard error and 95% limits
generate lnCCR_imp = ln(CCR_imp)
generate varCCR_imp = (1/count_ipd_imp)+(1-prop_carr_imp)/(prop_carr_imp*popnsampled_carr)
generate SElnCCR_imp = sqrt(varCCR_imp)
generate lbCCR_imp = exp(lnCCR_imp-1.96*SElnCCR_imp)
generate ubCCR_imp = exp(lnCCR_imp+1.96*SElnCCR_imp)

order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp est_eb_d incidence_ipd count_ipd_imp count_ipd imputed_inci no_coverge_inc
browse if imputed_inci == 1
* Data are saved outside the do-file after both imputations (data/SEXDATAM1):
* just use SEXDATAM1 if you want both disease and carriage imputation for the
* sex data. Do not rerun the code until the carriage imputation code is edited
* to run faster (it was running for 24 hours).
* ---- Final checks ----
* Serotypes imputed for both carriage and disease
tab serotype if imputed_inci==1 & imputed_preva==1,m
* Serotypes imputed in carriage only
tab serotype if imputed_preva==1 & missing(imputed_inci),m
br serotype count_carr count_carr_imp prop_carr prop_carr_imp count_ipd_imp count_ipd ///
incidence_ipd incidence_ipd_imp lnCCR lnCCR_imp if imputed_preva==1 & missing(imputed_inci)
* Check missing lnCCR_imp among rows imputed in carriage only (earlier run:
* 420 (19%) non-missing lnCCR_imp in this category, i.e. 81% missing, mainly
* because of zero IPD counts)
count if imputed_preva==1 & missing(imputed_inci)
count if imputed_preva==1 & missing(imputed_inci) & missing(lnCCR_imp)

* Save BEFORE the diagnostic fit below. That fit is the one reported as
* failing to converge; when it ran uncaptured ahead of -save-, an error there
* stopped the do-file and the results of the whole run were never written.
save data/SEXDATAM,replace

* Why is serotype 1 failing to converge with very good counts of IPD?
* -capture noisily- shows the output but lets the do-file finish if it errors.
* The fit does not modify the data, so the saved file is unaffected.
capture noisily betabin count_ipd if serotype=="1" , n(midyrpopn_ipd) link(cloglog)
It is taking 2 days to run. I have been reading about parallel and it seems it may make things run faster, but I have not understood it. Can someone edit the code for me and place parallel where required? Can I set any number of clusters using parallel, or is there a way to decide the number of clusters depending on your data?
Kindly help me use the parallel command to run my code below:
* ------------------------------------------------------------------
* Set-up: load the aggregated data, harmonise the vaccine fields and
* create the covariates and result holders used by the imputation steps.
* ------------------------------------------------------------------
cd "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW"

* Load the data (the original note describes this as "reading HIV data").
* Earlier input file, kept for reference:
*use data/SEXDATA_AGREGATED_zerocasesIncluded,clear
use data/SEXDATA_AGREGATED_zerocasesIncluded2024BEST, clear

* ---- Superseded approach (all commented out, kept for reference) ----
* We agreed with Kate to replace the mid-year population by half in sex
*gen midyrpopn_ipd5=(midyrpopn_ipd*0.5)
*drop midyrpopn_ipd
*ren midyrpopn_ipd5 midyrpopn_ipd
* Generating CCR and its standard error (imputing disease=1 if disease==0)
*replace count_ipd=1 if count_ipd==0
*gen incidence=(count_ipd/midyrpopn_ipd)*100000
* Where carriage is zero, replace prevalence by a small constant (Christian's
* advice). The original note says 0.05 but the commented code uses 0.005.
*gen prevalence=count_carr/popnsampled_carr
*replace prevalence=0.005 if count_carr==0
*gen CCR=incidence/prevalence
* log scale
*gen lnCCR=log(CCR)
*gen SElnCCR=sqrt((1/count_ipd)+((1-prevalence)/(prevalence*popnsampled_carr)))
* serotype label harmonisation
*replace serotype="18A/18B/18C/18F" if serotype=="18C/18F/18B/18A"
*replace serotype="6C/6D" if serotype=="6C/D"
*replace serotype="6A/6B" if serotype=="6A/B"
*replace serotype="NT" if serotype=="NON-TYPABLE"

* ---- Current approach: zero disease / zero carriage are imputed with a
* ---- Bayesian approach in the steps that follow this set-up.
*browse if year_sampling_carr=="2009" & (country=="The Gambia" | country=="Gambia")

* Vaccine-status corrections for specific country/period records
replace PCV = "No PCV" if year == 2009 & (Country == "GAMBIA")
replace pre_post_vax = "pre" if year == 2009 & (Country == "GAMBIA")
replace PCV = "PCV10" if pre_post_vax == "post" & Country == "Mozambique"
*replace yr_pcv_intro=2009 if year_sampling_carr=="2009" & (country=="The Gambia" | country=="Gambia")
*replace country="The Gambia" if country=="Gambia"
*PCV13 was introduced 2011
*replace yr_pcv_intro=2011 if country=="The Gambia" & pcv=="PCV13"
*browse if country=="The Gambia"

* Covariates: pcv7 pcv10 pcv13 lmic umic hic age5_14 age15more ageallages

* PCV product in use: normalise to trimmed lower case, inspecting before and after
codebook PCV
replace PCV = trim(lower(PCV))
codebook PCV
foreach product in pcv7 pcv10 pcv13 {
    generate `product' = (PCV == "`product'")
}

* Income group indicators (variable name is the lower-case group code)
foreach grp in LMIC UMIC HIC {
    local dummy = lower("`grp'")
    generate `dummy' = (Income_grp == "`grp'")
}

* Age-group indicators taken from age_ipd
generate age5_14 = (age_ipd == "5-14years")
generate age15more = (age_ipd == "15years & Above")
generate ageallages = (age_ipd == "all_ages")

* Post-vaccination period indicator
generate postpcv = (pre_post_vax == "post")

* Observed incidence per 100,000 and observed carriage prevalence
generate incidence = (count_ipd/midyrpopn_ipd)*100000
generate prevalence = count_carr/popnsampled_carr

* Result holders, filled in by the imputation steps.
* NOTE(review): sig is a fixed constant; presumably the dispersion value used
* in the empirical-Bayes formulas later on -- confirm where it comes from.
generate est_eb = .
generate sig = .1147768
foreach holder in imputed_preva imputed_inci no_coverge_preva no_coverge_inc {
    generate `holder' = .
}
**
*levelsof agr, local(levels)
*foreach i of local levels
** Predicting carriage prevalence, one serotype at a time.
**
** The earlier version looped over -levelsof serotype- and fitted each model
** with -if serotype=="..."-, repeating the same fit/predict/replace code three
** times and rescanning the whole data set on every command (about 24 hours).
** It is rewritten below in the same -runby- form used for the disease step:
** the program only ever sees the observations of one serotype.
**
** Things to be aware of:
**  - -runby- keeps whatever is in memory when the program ends. If the program
**    exits with an error for a by-group, that group's observations are not
**    returned, so check the error count that -runby- reports.
**  - -levelsof- skipped observations with an empty serotype; -runby- also
**    processes them as a by-group of their own.
**  - Output from inside the program is hidden unless -verbose- is added.
**  - NOTE(review): serotypes are fitted independently of each other, so this
**    call is the natural place to try -parallel- (SSC) with by(serotype);
**    confirm the exact syntax in -help parallel- before relying on it.
capture program drop one_serotype_carr
program define one_serotype_carr
    * Reads : count_carr popnsampled_carr sig and the covariates listed below
    * Writes: est_eb and imputed_preva (when a model fits), or
    *         no_coverge_preva (when none of the models can be fitted)
    *
    * Candidate covariate sets, tried in this order until one fits without error
    local spec1 "pcv7 pcv10 pcv13 lmic umic hic age5_14 age15more ageallages"
    local spec2 "postpcv hic age5_14 age15more ageallages"
    local spec3 ""

    display serotype[1]
    local fitted 0
    forvalues m = 1/3 {
        capture betabin count_carr `spec`m'', n(popnsampled_carr) link(cloglog)
        if _rc == 0 {
            local fitted 1
            continue, break
        }
        if `m' == 1 {
            display "model1 did not converge"
        }
    }

    if `fitted' {
        * Combine the model prediction with the observed count
        predict mu
        replace imputed_preva = 1
        replace est_eb = (mu/sig + count_carr)/(mu/sig + (1-mu)/sig + popnsampled_carr)
        drop mu
    }
    else {
        replace no_coverge_preva = 1
    }
end
runby one_serotype_carr, by(serotype) status
*** (save the data after carriage prevalence imputation not within the code) save "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW/data/SEXDATAMPREV_IMPUTED.dta",replace
* ---- Carriage: combine observed and imputed prevalence ----
sort serotype
label variable est_eb "Bayes probability carriage"

* Use the Bayes estimate only where the observed prevalence is exactly zero
* and an estimate is available; otherwise keep the observed prevalence.
generate prop_carr_imp = cond(prevalence == 0 & !missing(est_eb), est_eb, prevalence)
generate count_carr_imp = prop_carr_imp*popnsampled_carr

* Visual checks
sort serotype
browse prevalence prop_carr_imp Country count_carr_imp Country est_eb
browse prevalence prop_carr_imp Country count_carr_imp count_carr est_eb

rename prevalence prop_carr
sort serotype

* Which serotypes had carriage imputed?
tabulate serotype if imputed_preva == 1, missing
order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp
browse if imputed_preva == 1

* Data are saved outside the do-file at this point (after carriage imputation):
* save "/Users/fodiwuor/Library/CloudStorage/OneDrive-KemriWellcomeTrust/fodiwuor/studies/AASRF_projects/CARRIAGE AND SYTEMATICREVIEW/data/SEXDATAMPREV_IMPUTED.dta",replace
* The carriage-imputed sex data are used to manually select serotypes for analysis.

* ---- Disease: holder for the imputed IPD estimate ----
generate est_eb_d = .
* Clyde's code is used for imputing disease (it is faster than the loop
* version; adapt it later for the carriage imputation).
capture program drop one_serotype
program define one_serotype
    * Impute IPD incidence for the single serotype currently in memory
    * (intended to be called once per by-group through -runby-).
    *
    * Reads : count_ipd midyrpopn_ipd postpcv hic age15more sig
    * Writes: est_eb_d and imputed_inci (when a model fits), or
    *         no_coverge_inc (when none of the models can be fitted)
    *
    * Models are tried in order until one runs without error:
    *   1. postpcv hic age15more
    *   2. intercept only
    * The former middle model (postpcv hic) had been switched off by inserting
    * a non-existent variable name (betain) so that it always errored and fell
    * through to the intercept-only model. It is simply left out of the list
    * here, which gives the same results without relying on a forced error.
    * To bring it back: set spec2 to "postpcv hic", add an empty spec3 and
    * loop over 1/3.
    local spec1 "postpcv hic age15more"
    local spec2 ""

    display serotype[1]
    local fitted 0
    forvalues m = 1/2 {
        capture betabin count_ipd `spec`m'', n(midyrpopn_ipd) link(cloglog)
        if _rc == 0 {
            local fitted 1
            continue, break
        }
        if `m' == 1 {
            display "model1 did not converge"
        }
    }

    if `fitted' {
        * Combine the model prediction with the observed count
        predict mu
        replace imputed_inci = 1
        replace est_eb_d = (mu/sig + count_ipd)/(mu/sig + (1-mu)/sig + midyrpopn_ipd)
        drop mu
    }
    else {
        replace no_coverge_inc = 1
    }
end
* Fit the disease model separately for every serotype
runby one_serotype, by(serotype) status

* Bring the key columns to the front and inspect the imputed rows
order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp est_eb_d incidence count_ipd imputed_inci no_coverge_inc
browse if imputed_inci == 1

* ---- Variables derived after disease imputation ----
* Multiply the estimate by 100,000 to express it as an incidence per 100,000
replace est_eb_d = (est_eb_d*100000)
rename incidence incidence_ipd

* Use the Bayes estimate only where the observed incidence is exactly zero
* and an estimate is available; otherwise keep the observed incidence.
generate incidence_ipd_imp = cond(incidence_ipd == 0 & !missing(est_eb_d), est_eb_d, incidence_ipd)
label variable incidence_ipd_imp "imputed IPD replac 0 with est_eb"
generate count_ipd_imp = (incidence_ipd_imp/100000)*midyrpopn_ipd

* Case-carrier ratio from the observed data, its natural log and standard error
generate CCR = incidence_ipd/prop_carr
generate lnCCR = ln(CCR)
generate SElnCCR = sqrt((1/count_ipd)+((1-prop_carr)/(prop_carr*popnsampled_carr)))

* Case-carrier ratio from the imputed data
generate CCR_imp = incidence_ipd_imp/prop_carr_imp
*codebook CCR_imp if include==1
* natural log, variance, standard error and 95% limits
generate lnCCR_imp = ln(CCR_imp)
generate varCCR_imp = (1/count_ipd_imp)+(1-prop_carr_imp)/(prop_carr_imp*popnsampled_carr)
generate SElnCCR_imp = sqrt(varCCR_imp)
generate lbCCR_imp = exp(lnCCR_imp-1.96*SElnCCR_imp)
generate ubCCR_imp = exp(lnCCR_imp+1.96*SElnCCR_imp)

order serotype count_carr count_carr_imp est_eb imputed_preva prop_carr prop_carr_imp est_eb_d incidence_ipd count_ipd_imp count_ipd imputed_inci no_coverge_inc
browse if imputed_inci == 1
* Data are saved outside the do-file after both imputations (data/SEXDATAM1):
* just use SEXDATAM1 if you want both disease and carriage imputation for the
* sex data. Do not rerun the code until the carriage imputation code is edited
* to run faster (it was running for 24 hours).
* ---- Final checks ----
* Serotypes imputed for both carriage and disease
tab serotype if imputed_inci==1 & imputed_preva==1,m
* Serotypes imputed in carriage only
tab serotype if imputed_preva==1 & missing(imputed_inci),m
br serotype count_carr count_carr_imp prop_carr prop_carr_imp count_ipd_imp count_ipd ///
incidence_ipd incidence_ipd_imp lnCCR lnCCR_imp if imputed_preva==1 & missing(imputed_inci)
* Check missing lnCCR_imp among rows imputed in carriage only (earlier run:
* 420 (19%) non-missing lnCCR_imp in this category, i.e. 81% missing, mainly
* because of zero IPD counts)
count if imputed_preva==1 & missing(imputed_inci)
count if imputed_preva==1 & missing(imputed_inci) & missing(lnCCR_imp)

* Save BEFORE the diagnostic fit below. That fit is the one reported as
* failing to converge; when it ran uncaptured ahead of -save-, an error there
* stopped the do-file and the results of the whole run were never written.
save data/SEXDATAM,replace

* Why is serotype 1 failing to converge with very good counts of IPD?
* -capture noisily- shows the output but lets the do-file finish if it errors.
* The fit does not modify the data, so the saved file is unaffected.
capture noisily betabin count_ipd if serotype=="1" , n(midyrpopn_ipd) link(cloglog)
It is taking 2 days to run. I have been reading about parallel and it seems it may make things run faster, but I have not understood it. Can someone edit the code for me and place parallel where required? Can I set any number of clusters using parallel, or is there a way to decide the number of clusters depending on your data?
Comment