import os import csv from math import log from scipy.special import gammaln # ns, |s|, nUS, totUS # Please pass floats only to this function, otherwise there may be problems with division. # Log Likelihood, Poisson def Statiness(ns,s,nUS,US): nsb = nUS*float(s)/float(US) if ns >= nsb: return gammaln(ns+1)-gammaln(nsb+1)-(ns-nsb)*log(nsb) else: return 0. # Log Likelihood, Binomial dist. # Chance of finding ns occurences of name out of s names with probability nus/us #def Statiness(ns,s,nUS,US): # nsb = nUS*s/US # if ns >= nsb: # return -gammaln(s+1)+gammaln(1+ns)+gammaln(1+s-ns)-ns*log(nUS/US)-(s-ns)*log(1-nUS/US) # else: # return 0. # Difference of abundances #def Statiness(ns,s,nUS,US): # nsb = nUS*float(s)/float(US) # if ns >= nsb: # return ns/s-nUS/US # else: # return 0. def notExcluded(string,exclList): for test in exclList: if string == test: return False else: return True # Specify: Year Of Birth range, Sex, # Max Number of names to consider per state # list of excluded names (e.g. 'Infant') minYear = 2010-30 maxYear = 2010-20 sex = 'F' # Names to exclude from analysis. ExcludedNames = ['Infant','Baby','Male','Boy','Female','Girl','Unknown','None'] # Path to output results outputFile = 'path to output file' # If we don't have to pre-declare the number of names, we don't need this. #MaxNamesPerState = 250 # Using the SSA data, which is in CSV format in .txt files, one per state # With file names '[POSTAL CODE].txt' # Read in number of states, and names of states. dataDir = "directory with state data files" os.chdir(dataDir) fileList = next(os.walk("."))[2] numOfStates = len(fileList) stateList = (map(lambda s:s[0:2],fileList)) stateList.insert(0,'-State totals-') # Create array, (Max Number of names per state+1)x(Number of states + 1) # The +1's are for keeping track of totals. The structure of the array is to be as follows: # Actually, we can use python lists, which allow dynamic insertion. Every time we add a new name # we'll insert a new list of length (numOfStates + 1) # Actually, start with one entry, the totals row. Keep in mind that the first row is totals!!! #______| total | state1 | ... | #total |#totUS |#inS1 | ... | #name1 |#totn1 | ... | #name2 | ... | # The array should be initialized to all zeros. # Also maintain a list of names, whose ordering correspond to the ordering of rows in the number table. nameTable = [[0.]*(numOfStates + 1)] nameList = ['-Name Totals-'] # Go through each state file. Each line has a name, with a year of birth, a sex, and a frequency, ordered # by frequency (followed by alphabetization) # We wish to add names to the table in order of frequency, within the birth range. # To add to the table, check if the name has been added to the table yet. # If it has, update the column for the current state, and all three relevant totals (current state, current name, current) # If it hasn't, insert a new row at the end, and update the same values. for stateIndex in range(1,numOfStates+1): # Open the state's file (note the off by one: stateIndex corresponds to the column in nameTable, which starts at 1 # to account for the totals with open(fileList[stateIndex-1]) as csvfile: statereader = csv.reader(csvfile,delimiter=',') for row in statereader: if row[1] == sex and int(row[2]) <= maxYear and int(row[2]) >= minYear and notExcluded(row[3],ExcludedNames): # Name is valid, add it to the table # May as well keep the list of names sorted, so as to cut down on searches # Log2, baby! name = row[3] numInState = float(row[4]) # Do a binary search for the current name in the list. lindex = 1 uindex = len(nameList)-1 nameIndex = (lindex+uindex)/2 while (lindex <= uindex): nameIndex = (lindex+uindex)/2 # integer division! if nameList[nameIndex] > name: uindex = nameIndex - 1 elif nameList[nameIndex] < name: lindex = nameIndex + 1 else: break # If we didn't find the name in the list, insert a new row for it. In the end, nameIndex should # be the row to update. if nameList[nameIndex] > name: nameList.insert(nameIndex,name) nameTable.insert(nameIndex,[0.]*(numOfStates + 1)) nameIndex = nameIndex elif nameList[nameIndex] < name: nameList.insert(nameIndex + 1,name) nameTable.insert(nameIndex + 1,[0.]*(numOfStates + 1)) nameIndex = nameIndex + 1 # Update the table now that it is the right shape. nameTable[nameIndex][stateIndex] = nameTable[nameIndex][stateIndex] + numInState nameTable[0][stateIndex] = nameTable[0][stateIndex] + numInState nameTable[nameIndex][0] = nameTable[nameIndex][0] + numInState nameTable[0][0] = nameTable[0][0] + numInState # For future reference the code naturally divides here. Before this point we are populating # the table of names, states, and name abundances. AFter this point we are doing analysis # on the table. In principle we could read in the table once, store it in a more convenient form, # and then do multiple analyses on it. # Once the table is fully populated, calculate the statiness and find the max for each row. # Declare a list, (Number of States)x(tuples with name row number and max statiness) # For each state, calculate the statiness for each name, constantly update the max. # Here we are actually keeping track of the top 3 max statiness names. maxStat = [([0.]*(numOfStates + 1)),([0.]*(numOfStates + 1)),([0.]*(numOfStates + 1))] maxStatNames = [(['None']*(numOfStates + 1)),(['None']*(numOfStates + 1)),(['None']*(numOfStates + 1))] totUS = nameTable[0][0] for stateIndex in range(1,numOfStates+1): numInState = nameTable[0][stateIndex] for nameIndex in range(1,len(nameList)): # ns, |s|, nUS, totUS stat = Statiness(float(nameTable[nameIndex][stateIndex]),float(numInState),float(nameTable[nameIndex][0]),float(totUS)) if stat >= maxStat[0][stateIndex]: maxStat[0][stateIndex] = stat maxStatNames[0][stateIndex] = nameList[nameIndex] elif stat >= maxStat[1][stateIndex]: maxStat[1][stateIndex] = stat maxStatNames[1][stateIndex] = nameList[nameIndex] elif stat >= maxStat[2][stateIndex]: maxStat[2][stateIndex] = stat maxStatNames[2][stateIndex] = nameList[nameIndex] # These last lines package together the state-y names together with their respective states and s-values # and then prints it out. results = list(map(list,zip(*([stateList,maxStatNames[0],maxStat[0],maxStatNames[1],maxStat[1],maxStatNames[2],maxStat[2]])))) #print results[1:] # Final result is the list of maximally statey names. If we wanted, we could even return the top 3. with open(outputFile,mode='w') as outfile: writer = csv.writer(outfile,delimiter=',') for row in results[1:]: writer.writerow(row)