I tried to find a Stata module that counts the unique letters in a word, but couldn't find one. For example, the word "alberta" contains a, b, e, l, r, and t, so the new variable should display 6.
-
Login or Register
- Log in with
* Example generated by -dataex-. For more info, type help dataex
clear
input str20 province
"alberta"
"new brunswick"
"prince edward island"
end

* The longest name determines how many per-position variables are needed.
gen length = strlen(province)
summarize length, meanonly

* Split each name into one variable per character position. Positions past
* the end of a shorter name, and the blanks between words, are left empty so
* that only actual letters are counted below (a blank is not a letter).
quietly forval j = 1/`r(max)' {
    gen char`j' = substr(province, `j', 1) if !inlist(substr(province, `j', 1), "", " ")
}

* rowsvals() counts the distinct non-empty string values across the row.
* It is not built in: it is part of the egenmore package from SSC
* (ssc install egenmore).
egen wanted = rowsvals(char*)

l province wanted

/* Expected listing:

     +-------------------------------+
     |             province   wanted |
     |-------------------------------|
  1. |              alberta        6 |
  2. |        new brunswick       10 |
  3. | prince edward island       11 |
     +-------------------------------+
*/

* Remove the scratch variables once the count has been stored.
drop char*
* Count the distinct letters in province, ignoring case: walk through the
* candidate code points and add one for every letter that occurs in the
* lowercased name.
generate byte nchars = 0

* Code points 65 to 255 cover parts of the Unicode blocks "Basic Latin" and
* "Latin-1 Supplement".
forvalues cp = 65/255 {
    * Skip code points that are not letters (digits, punctuation, symbols).
    if !uisletter(uchar(`cp')) continue

    * ustrpos() returns the position of the first match, or 0 when absent.
    replace nchars = nchars + 1 if ustrpos(ustrlower(province), uchar(`cp')) > 0
}
* Unicode-aware, case-insensitive form of the per-position split: take the
* character at the current position of the lowercased name, and leave the
* new variable empty where the name is too short to have one.
generate char`j' = usubstr(ustrlower(province), `j', 1) ///
    if !missing(usubstr(ustrlower(province), `j', 1))
* Count the distinct letters in each place name, ignoring case and every
* character that is not a letter (blanks, apostrophes, hyphens).
clear *
cls

input str30 input
"Ontario"
"Alberta"
"New Brunswick"
"Prince Edward Island"
"München"
"Malmö"
"L'Aquila"
"Emiglia-Romagna"
end

* Keep only Unicode letters, then fold to lower case.
generate letters = ustrregexra(input, "\P{L}", "", 1)
replace letters = ustrlower(letters)    // <-- comment this line out if capitalization should matter
generate textlen = ustrlen(letters)

* unique_letters collects each letter the first time it is seen; head and
* todo are scratch variables holding the current letter and the text that
* still has to be processed.
generate unique_letters = ""
generate head = ""
generate todo = letters

* The longest cleaned name bounds the number of passes that can be needed.
summarize textlen, meanonly
local passes = r(max)

forvalues pass = 1/`passes' {
    quietly {
        * Take the first remaining letter and record it ...
        replace head = usubstr(todo, 1, 1)
        replace unique_letters = unique_letters + head
        * ... then delete every occurrence of it from the remaining text.
        replace todo = ustrregexra(todo, head, "", 0)
    }
}

drop letters head todo

generate n_unique = ustrlen(unique_letters)
list, sep(0) abbrev(20)
     +------------------------------------------------------------+
     |                input   textlen   unique_letters   n_unique |
     |------------------------------------------------------------|
  1. |              Ontario         7           ontari          6 |
  2. |              Alberta         7           albert          6 |
  3. |        New Brunswick        12       newbrusick         10 |
  4. | Prince Edward Island        18      princedwasl         11 |
  5. |              München         7           münche          6 |
  6. |                Malmö         5             malö          4 |
  7. |             L'Aquila         7            laqui          5 |
  8. |      Emiglia-Romagna        14        emiglaron          9 |
     +------------------------------------------------------------+
Comment