Dear Statalist,
I am currently computing a multivariate Nadaraya-Watson estimator in Mata.
The problem is the following: I have two group identifiers G and T, a binary variable of interest D and covariates X1,...,XJ which form a matrix X. I want to estimate E(D|G=1, T=0) at values X=x in the population {G=1, T=1}. So I want to run a multivariate kernel regression for every individual in the population {G=1, T=1}. In my code, I am using matrices of size (card population {G=1, T=1}, cardinal population {G=1,T=0}). Mata already refused once to allocate memory for a 15000 by 15000 matrix with only two covariates.
Any idea how to make my solution less memory intensive?
I attached the formula I want to compute in a .docx file.
I also give you some replicable code.
Thanks in advance.
clear all
clear mata
program mv_ker_stata, byable(recall) rclass
syntax varlist(numeric min=4) [if] [in] [, kervar(name)]
marksample touse
tokenize `varlist', parse(" ,")
args G T D
tempname counter n11 n10
tempvar isI11 isI10
local toremove="`G' `T' `D'"
local new_varlist: list varlist-toremove
/*number of covariates*/
local count_new: word count of `new_varlist'
scalar `counter'=`count_new'
/*create equivalent of `touse' on populations of interest*/
quietly gen `isI11'=(`G'==1 & `T'==1 & `touse')
quietly gen `isI10'=(`G'==1 & `T'==0 & `touse')
/*size of populations of interest to get matrix dimensions*/
quietly count if `isI11'==1
scalar `n11'=r(N)
quietly count if `isI10'==1
scalar `n10'=r(N)
mata: mv_ker("`new_varlist'","`D'","`counter'","`n10'"," `n11'","`isI10'","`isI11'","`kervar'")
end
version 13
mata:
mata set matastrict on
void mv_ker(string scalar varlist, string scalar d, ///
string scalar nvar, string scalar n10, string scalar n11, string scalar isI10, string scalar isI11, string scalar ker_var) {
real matrix X10,X11,D,v1,v2,bwidth,prod,sum_ker,crossprod,res
real scalar nvars,N10,N11,i
pointer(real matrix) rowvector ptr1,ptr2
X10=X11=D=res=.
st_view(X10,.,tokens(varlist),isI10)
st_view(X11,.,tokens(varlist),isI11)
st_view(D,.,d,isI10)
st_view(res,.,st_addvar("double",ker_var),isI11)
nvars=st_numscalar(nvar)
N10=st_numscalar(n10)
N11=st_numscalar(n11)
v1=J(1,N10,1)
v2=J(N11,1,1)
/*vector of bandwidth (Silverman rule) for every covariate*/
bwidth=(diagonal(variance(X10))')*((4/(nvars+2))^(1/(nvars+4)))/(N10^(1/(nvars+4)))
prod=J(N11,N10,1)
ptr1=J(1,nvars,NULL)
ptr2=J(1,nvars,NULL)
/*display population sizes to give an idea of how large the matrices are*/
N10
N11
for (i=1;i<nvars;i++) {
real matrix v3,v4,v5
/*create huge matrix with elements (x-X_i) for every i in population {G=1,T=0} and every x in population {G=1,T=1}*/
v3=X11[.,i]
v4=v3#v1
v3=X10[.,i]'
v5=v2#v3
ptr1[i]=&(J(N11,N10,NULL))
(*ptr1[i])=(1/bwidth[1,i])*(v4-v5)
ptr2[i]=&(J(N11,N10,NULL))
(*ptr2[i])=abs((*ptr1[i]))
(*ptr2[i])=((*ptr2[i]):<=1)
(*ptr1[i])=(*ptr1[i]):*(*ptr2[i])
/*epan2 kernel*/
(*ptr1[i])=0.75*(1:-((*ptr1[i]):*(*ptr1[i])))
/*multiply kernels for all the covariates:*/
prod[.,.]=prod:*(*ptr1[i])
}
/*get final vector*/
sum_ker=rowsum(prod)
crossprod=prod*D
res[.,.]=crossprod:/sum_ker
}
end
/*TEST THE COMMAND:*/
set more off
local numlist="1000 10000 20000 30000"
local count: word count of `numlist'
local count_minus=`count'-1
forvalues j=1(1)`count_minus' {
local word`j': word `j' of `numlist'
display `word`j''
set seed 1
set obs `word`j''
gen G`j'=(uniform()>0.5)
gen T`j'=(uniform()>0.5)
gen D`j'=(uniform()<0.3)
local varlist=""
forvalues i=1(1)9 {
gen var_`j'`i'=rnormal()
local varlist="`varlist' var_`j'`i'"
timer on `j'`i'
mv_ker_stata G`j' T`j' D`j' `varlist', kervar(kervar_`j'`i')
timer off `j'`i'
timer list `j'`i'
}
}
I am currently computing a multivariate Nadaraya-Watson estimator in Mata.
The problem is the following: I have two group identifiers G and T, a binary variable of interest D and covariates X1,...,XJ which form a matrix X. I want to estimate E(D|G=1, T=0) at values X=x in the population {G=1, T=1}. So I want to run a multivariate kernel regression for every individual in the population {G=1, T=1}. In my code, I am using matrices of size (card population {G=1, T=1}, cardinal population {G=1,T=0}). Mata already refused once to allocate memory for a 15000 by 15000 matrix with only two covariates.
Any idea how to make my solution less memory intensive?
I attached the formula I want to compute in a .docx file.
I also give you some replicable code.
Thanks in advance.
clear all
clear mata
program mv_ker_stata, byable(recall) rclass
syntax varlist(numeric min=4) [if] [in] [, kervar(name)]
marksample touse
tokenize `varlist', parse(" ,")
args G T D
tempname counter n11 n10
tempvar isI11 isI10
local toremove="`G' `T' `D'"
local new_varlist: list varlist-toremove
/*number of covariates*/
local count_new: word count of `new_varlist'
scalar `counter'=`count_new'
/*create equivalent of `touse' on populations of interest*/
quietly gen `isI11'=(`G'==1 & `T'==1 & `touse')
quietly gen `isI10'=(`G'==1 & `T'==0 & `touse')
/*size of populations of interest to get matrix dimensions*/
quietly count if `isI11'==1
scalar `n11'=r(N)
quietly count if `isI10'==1
scalar `n10'=r(N)
mata: mv_ker("`new_varlist'","`D'","`counter'","`n10'"," `n11'","`isI10'","`isI11'","`kervar'")
end
version 13
mata:
mata set matastrict on
void mv_ker(string scalar varlist, string scalar d, ///
string scalar nvar, string scalar n10, string scalar n11, string scalar isI10, string scalar isI11, string scalar ker_var) {
real matrix X10,X11,D,v1,v2,bwidth,prod,sum_ker,crossprod,res
real scalar nvars,N10,N11,i
pointer(real matrix) rowvector ptr1,ptr2
X10=X11=D=res=.
st_view(X10,.,tokens(varlist),isI10)
st_view(X11,.,tokens(varlist),isI11)
st_view(D,.,d,isI10)
st_view(res,.,st_addvar("double",ker_var),isI11)
nvars=st_numscalar(nvar)
N10=st_numscalar(n10)
N11=st_numscalar(n11)
v1=J(1,N10,1)
v2=J(N11,1,1)
/*vector of bandwidth (Silverman rule) for every covariate*/
bwidth=(diagonal(variance(X10))')*((4/(nvars+2))^(1/(nvars+4)))/(N10^(1/(nvars+4)))
prod=J(N11,N10,1)
ptr1=J(1,nvars,NULL)
ptr2=J(1,nvars,NULL)
/*display population sizes to give an idea of how large the matrices are*/
N10
N11
for (i=1;i<nvars;i++) {
real matrix v3,v4,v5
/*create huge matrix with elements (x-X_i) for every i in population {G=1,T=0} and every x in population {G=1,T=1}*/
v3=X11[.,i]
v4=v3#v1
v3=X10[.,i]'
v5=v2#v3
ptr1[i]=&(J(N11,N10,NULL))
(*ptr1[i])=(1/bwidth[1,i])*(v4-v5)
ptr2[i]=&(J(N11,N10,NULL))
(*ptr2[i])=abs((*ptr1[i]))
(*ptr2[i])=((*ptr2[i]):<=1)
(*ptr1[i])=(*ptr1[i]):*(*ptr2[i])
/*epan2 kernel*/
(*ptr1[i])=0.75*(1:-((*ptr1[i]):*(*ptr1[i])))
/*multiply kernels for all the covariates:*/
prod[.,.]=prod:*(*ptr1[i])
}
/*get final vector*/
sum_ker=rowsum(prod)
crossprod=prod*D
res[.,.]=crossprod:/sum_ker
}
end
/*TEST THE COMMAND:*/
set more off
local numlist="1000 10000 20000 30000"
local count: word count of `numlist'
local count_minus=`count'-1
forvalues j=1(1)`count_minus' {
local word`j': word `j' of `numlist'
display `word`j''
set seed 1
set obs `word`j''
gen G`j'=(uniform()>0.5)
gen T`j'=(uniform()>0.5)
gen D`j'=(uniform()<0.3)
local varlist=""
forvalues i=1(1)9 {
gen var_`j'`i'=rnormal()
local varlist="`varlist' var_`j'`i'"
timer on `j'`i'
mv_ker_stata G`j' T`j' D`j' `varlist', kervar(kervar_`j'`i')
timer off `j'`i'
timer list `j'`i'
}
}
Comment