Announcement

Collapse
No announcement yet.
X
  • Filter
  • Time
  • Show
Clear All
new posts

  • Multiple Imputation with Panel Data and Negative Values

    Hi,

    I am trying to impute missing values from UCR crime data using a chained equation model with linear regression. However, the model generates negative crime values that are implausible. Thus, it would be really helpful if someone could suggest how to address this issue. This is my first time imputing for missing values, so it would be really helpful if someone could respond.


    mi set wide

    mi register imputed Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly

    mi impute chained (regress) Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft ///
    Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons ///
    Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly ///
    = Tot_Officers Tot_Civ_Emp Unemp_Rate Avg_Weekly_Wage GDP_PerCap Male_Perc Hispanic_Perc Black_Perc Age_1624_Perc, add(20) rseed(12345) nolegend noisily noimputed force bootstrap

    local vars "Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly"
    foreach var of local vars {
    egen `var'_ImpMean = rowmean(_1_`var'-_20_`var')
    }


    input float ori str9 ORI double fips str20 state str33 county float year double(Murder _1_Murder) float Murder_ImpMean
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1990 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1990 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1990 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1991 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1991 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1991 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1992 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1992 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1992 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1993 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1993 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1993 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1994 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1994 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1994 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1995 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1995 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1995 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1996 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1996 1 1 1
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1996 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1997 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1997 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1997 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1998 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1998 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1998 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 1999 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 1999 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 1999 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2000 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2000 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2000 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2001 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2001 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2001 1 1 1
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2002 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2002 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2002 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2003 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2003 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2003 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2004 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2004 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2004 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2005 1 1 1
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2005 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2005 . -8.090556023485881 -3.029669
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2006 . 1.9373855949781276 -2.501579
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2006 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2006 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2007 1 1 1
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2007 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2007 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2008 1 1 1
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2008 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2008 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2009 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2009 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2009 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2010 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2010 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2010 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2011 . -1.0829318935674845 -1.9352152
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2011 . -7.027801633499241 -.6673061
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2011 . 1.1019194333660172 3.181552
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2012 . -6.335854940855695 -.07712661
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2012 . -9.159714860346124 .8726751
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2012 . 2.9786409098697253 -.017971981
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2013 . -3.053980170947446 -.15758996
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2013 . -2.067913775015881 .13371472
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2013 . -.9866195381136287 -.25479743
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2014 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2014 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2014 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2015 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2015 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2015 3 3 3
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2016 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2016 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2016 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2017 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2017 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2017 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2018 0 0 0
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2018 0 0 0
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2018 0 0 0
    11 "AL00508" 1003 "ALABAMA" "BALDWIN" 2019 1 1 1
    10 "AL00505" 1003 "ALABAMA" "BALDWIN" 2019 1 1 1
    9 "AL00501" 1003 "ALABAMA" "BALDWIN" 2019 0 0 0
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1990 3 3 3
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1991 2 2 2
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1992 0 0 0
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1993 1 1 1
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1994 0 0 0
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1995 0 0 0
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1996 0 0 0
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1997 . -.8895328461551177 2.080324
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1998 . 4.434697612914603 1.965525
    12 "AL00600" 1005 "ALABAMA" "BARBOUR" 1999 1 1 1
    end

  • #2
    When all the variables in your model are either continuous or binary, I suggest you use PMM. I am not sure why negative values are produced. Can you share some part of the original data? Please make sure to check the original data very carefully (any not correctly coded missing values, something like -999?)

    I do not understand at all the last step where you mean average the imputations. This should almost never be done. If you want to analyze the imputed value, use:

    Code:
    mi estimate: regress Murder
    I would use the following command, as a suggestion:

    Code:
    mi set flong
    
    mi register imputed Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly
    
    mi impute chained (pmm, knn(5)) Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft ///
    Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons ///
    Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly ///
    = Tot_Officers Tot_Civ_Emp Unemp_Rate Avg_Weekly_Wage GDP_PerCap Male_Perc Hispanic_Perc Black_Perc Age_1624_Perc ///
    , add(5) burnin(20) rseed(12345) dots
    If you need to use FORCE, this means there are some problems with your data usually. In most cases, system missings are not correctly coded or removed (values like .a .b or so).
    Best wishes

    (Stata 16.1 MP)

    Comment


    • #3
      Felix,

      Thanks for the great advice. The imputation worked with the pmm estimation technique and no longer yields negative value. Is there a rule of thumb to choose the bandwidth? I have atmost 5 consecutive missing observation rows. The data has been cleaned thoroughly to remove any other errors.

      Comment


      • #4
        The KNN option is the number of matched donors to pick. As a general rule of thumb, this value should never be smaller than 2-3 and increase with your total sample size. I usually have 4-5 if the sample size is below 5k and 5-10 if the sample size is between 5k and 10k.
        Best wishes

        (Stata 16.1 MP)

        Comment


        • #5
          Is this is the sample size of the missing observations or of the entire sample complete and missing?

          Comment


          • #6
            Total sample size. However, share of missingness should be below 50%. Imputing more information can be problematic.
            Best wishes

            (Stata 16.1 MP)

            Comment


            • #7
              My N= 108690 and N_missing = 1931. Any suggestions on knn?

              Comment


              • #8
                This looks great. Potentially, you can even skip imputation at all and just perform listwise deletion... But of course, imputation can still improve, even if only very little. I think KNN of 15 should work fine.
                Best wishes

                (Stata 16.1 MP)

                Comment


                • #9
                  I am looking to create a balanced panel, my sample period is from 1990-2019. Does that inform any decision? Most of my missing observations are concentrated in a couple of states.

                  Comment


                  • #10
                    Sounds like a good idea. If you think that state has an extra role, you can impute by state by adding the option; by(state). This works as long as there is still enough complete information available within each state.
                    Best wishes

                    (Stata 16.1 MP)

                    Comment


                    • #11
                      Is there any general rule of thumb, for choosing the by(group)? Currently, I am imputing a dataset where the dependent (imputed) variables are at the agency-level while all the other independent variables (covariates) are observed at the county level. My goal is to impute at the agency-year level and then aggregate the data to the county-year level, in order to minimize aggregation bias. Do you think this should be okay?

                      Comment


                      • #12
                        This is not an easy question. I would suggest using the level most appropriate for the variables that have the highest missingness rates. And btw, "dependent" and "imputed" are not the same. Yes, you can impute the dependent variable as well, but this only helps your analysis if there are strong predictors available.
                        Best wishes

                        (Stata 16.1 MP)

                        Comment


                        • #13
                          Can you advise if I should use the noimputed option? Given that I am imputing arrest rates for various crimes. I am not sure if that would be informative.

                          Comment


                          • #14
                            Personally, I have never used this option and I do not think it is required. Maybe other users have different experience.
                            Best wishes

                            (Stata 16.1 MP)

                            Comment


                            • #15
                              After running mi impute with knn=15 and m=20 I see that for the missing values, the imputations are very much biased downwards. Is there any to check the quality of imputation? I had generated mean of all the imputation to see the average values.

                              mi impute chained (pmm, knn(15)) Murder Manslaughter Rape Robbery Assault Burglary Larceny_Theft ///
                              Vehicle_Theft Other_Assault Arson Forgery Fraud Embezzlement Stolen_Property Vandalism Weapons ///
                              Prostitution Drugs_Offenses Drugs_Sale Drugs_Possesion DUI Disorderly ///
                              = Tot_Officers Tot_Civ_Emp i.Major i.Minor Unemp_Rate Male_Perc Hispanic_Perc ///
                              Black_Perc Age_1624_Perc PerCapWageL PopDensityL ///
                              , add(20) rseed(12345) nolegend noisily bootstrap noimputed

                              clear
                              input double fips float ori str21 Imputation_Status int(Murder Robbery Assault) float(Murder_ImpMean Robbery_ImpMean Assault_ImpMean)
                              1073 1 "NA" 2 19 22 2 19 22
                              1073 1 "NA" 0 20 16 0 20 16
                              1073 1 "NA" 7 38 10 7 38 10
                              1073 1 "NA" 3 11 13 3 11 13
                              1073 1 "NA" 1 24 25 1 24 25
                              1073 1 "NA" 1 15 15 1 15 15
                              1073 1 "NA" 1 19 16 1 19 16
                              1073 1 "NA" 0 13 20 0 13 20
                              1073 1 "NA" 4 13 7 4 13 7
                              1073 1 "NA" 0 10 6 0 10 6
                              1073 1 "Historical Imputation" 3 14 7 3 14 7
                              1073 1 "NA" 6 18 8 6 18 8
                              1073 1 "NA" 0 7 14 0 7 14
                              1073 1 "NA" 6 6 1 6 6 1
                              1073 1 "NA" 3 3 9 3 3 9
                              1073 1 "NA" 4 6 8 4 6 8
                              1073 1 "NA" 1 10 14 1 10 14
                              1073 1 "NA" 3 8 23 3 8 23
                              1073 1 "NA" 3 22 19 3 22 19
                              1073 1 "NA" 0 6 12 0 6 12
                              1073 1 "NA" 2 4 14 2 4 14
                              1073 1 "Multiple Imputation" . . . 1 1 25
                              1073 1 "Multiple Imputation" . . . 1 1 21
                              1073 1 "Multiple Imputation" . . . 0 8 20
                              1073 1 "NA" 2 7 11 2 7 11
                              1073 1 "Historical Imputation" 3 6 9 3 6 9
                              1073 1 "NA" 3 4 6 3 4 6
                              1073 1 "NA" 5 8 11 5 8 11
                              1073 1 "NA" 4 7 12 4 7 12
                              1073 1 "Multiple Imputation" . . . 0 6 5
                              1073 2 "NA" 3 5 15 3 5 15
                              1073 2 "NA" 1 8 14 1 8 14
                              1073 2 "NA" 0 1 4 0 1 4
                              1073 2 "NA" 0 6 7 0 6 7
                              1073 2 "NA" 2 8 11 2 8 11
                              1073 2 "NA" 0 0 11 0 0 11
                              1073 2 "NA" 3 2 21 3 2 21
                              1073 2 "NA" 5 13 15 5 13 15
                              1073 2 "NA" 1 6 11 1 6 11
                              1073 2 "NA" 0 10 17 0 10 17
                              1073 2 "Historical Imputation" 1 9 14 1 9 14
                              1073 2 "NA" 2 8 10 2 8 10
                              1073 2 "NA" 0 13 13 0 13 13
                              1073 2 "NA" 1 1 7 1 1 7
                              1073 2 "NA" 0 5 10 0 5 10
                              1073 2 "NA" 0 1 14 0 1 14
                              1073 2 "NA" 0 3 14 0 3 14
                              1073 2 "NA" 0 11 1 0 11 1
                              1073 2 "NA" 1 5 10 1 5 10
                              1073 2 "NA" 0 5 7 0 5 7
                              1073 2 "NA" 0 11 6 0 11 6
                              1073 2 "Multiple Imputation" . . . 1 2 17
                              1073 2 "Multiple Imputation" . . . 0 1 15
                              1073 2 "Multiple Imputation" . . . 1 2 18
                              1073 2 "NA" 0 5 11 0 5 11
                              1073 2 "NA" 0 2 15 0 2 15
                              1073 2 "NA" 0 2 10 0 2 10
                              1073 2 "NA" 0 0 8 0 0 8
                              1073 2 "Multiple Imputation" . . . 0 2 13
                              1073 2 "Multiple Imputation" . . . 1 2 13
                              1073 3 "NA" 1 1 5 1 1 5
                              1073 3 "NA" 0 0 5 0 0 5
                              1073 3 "NA" 1 2 0 1 2 0
                              1073 3 "NA" 0 1 8 0 1 8
                              1073 3 "NA" 0 4 6 0 4 6
                              1073 3 "NA" 0 12 11 0 12 11
                              1073 3 "NA" 0 2 9 0 2 9
                              1073 3 "NA" 1 2 14 1 2 14
                              1073 3 "NA" 1 7 18 1 7 18
                              1073 3 "NA" 0 1 10 0 1 10
                              1073 3 "NA" 0 1 22 0 1 22
                              1073 3 "NA" 0 8 17 0 8 17
                              1073 3 "NA" 2 1 10 2 1 10
                              1073 3 "NA" 0 3 3 0 3 3
                              1073 3 "NA" 1 0 1 1 0 1
                              1073 3 "Multiple Imputation" . . . 0 1 13
                              1073 3 "Multiple Imputation" . . . 0 1 10
                              1073 3 "NA" 0 1 8 0 1 8
                              1073 3 "NA" 0 5 0 0 5 0
                              1073 3 "NA" 0 2 1 0 2 1
                              1073 3 "NA" 2 1 2 2 1 2
                              1073 3 "Multiple Imputation" . . . 1 1 8
                              1073 3 "Multiple Imputation" . . . 0 1 9
                              1073 3 "Multiple Imputation" . . . 0 2 9
                              1073 3 "NA" 0 0 3 0 0 3
                              1073 3 "NA" 0 1 2 0 1 2
                              1073 3 "NA" 1 1 3 1 1 3
                              1073 3 "NA" 1 0 7 1 0 7
                              1073 3 "NA" 0 1 0 0 1 0
                              1073 3 "Multiple Imputation" . . . 0 0 12
                              1073 4 "NA" 1 12 16 1 12 16
                              1073 4 "NA" 2 16 13 2 16 13
                              1073 4 "NA" 0 10 8 0 10 8
                              1073 4 "NA" 4 11 9 4 11 9
                              1073 4 "NA" 1 13 14 1 13 14
                              1073 4 "NA" 0 24 18 0 24 18
                              1073 4 "NA" 0 14 19 0 14 19
                              1073 4 "NA" 4 14 15 4 14 15
                              1073 4 "NA" 3 13 15 3 13 15
                              1073 4 "NA" 1 23 21 1 23 21
                              end
                              [/CODE]

                              Comment

                              Working...
                              X