import os
import csv
from math import log
from scipy.special import gammaln

# ns, |s|, nUS, totUS
# Please pass floats only to this function, otherwise there may be problems with division.
# Log Likelihood, Poisson
def Statiness(ns,s,nUS,US):
	nsb = nUS*float(s)/float(US)
	if ns >= nsb:
		return gammaln(ns+1)-gammaln(nsb+1)-(ns-nsb)*log(nsb)
	else:
		return 0.
		
# Log Likelihood, Binomial dist.
# Chance of finding ns occurences of name out of s names with probability nus/us
#def Statiness(ns,s,nUS,US):
#	nsb = nUS*s/US
#	if ns >= nsb:
#		return -gammaln(s+1)+gammaln(1+ns)+gammaln(1+s-ns)-ns*log(nUS/US)-(s-ns)*log(1-nUS/US)
#	else:
#		return 0.

# Difference of abundances
#def Statiness(ns,s,nUS,US):
#	nsb = nUS*float(s)/float(US)
#	if ns >= nsb:
#		return ns/s-nUS/US
#	else:
#		return 0.
	
def notExcluded(string,exclList):
	for test in exclList:
		if string == test:
			return False
	else:
		return True

# Specify: Year Of Birth range, Sex,
# Max Number of names to consider per state
# list of excluded names (e.g. 'Infant')
minYear = 2010-30
maxYear = 2010-20
sex = 'F'

# Names to exclude from analysis.
ExcludedNames = ['Infant','Baby','Male','Boy','Female','Girl','Unknown','None']

# Path to output results
outputFile = 'path to output file'
# If we don't have to pre-declare the number of names, we don't need this.
#MaxNamesPerState = 250

# Using the SSA data, which is in CSV format in .txt files, one per state
# With file names '[POSTAL CODE].txt'
# Read in number of states, and names of states.
dataDir = "directory with state data files"
os.chdir(dataDir)
fileList = next(os.walk("."))[2]
numOfStates = len(fileList)
stateList = (map(lambda s:s[0:2],fileList))
stateList.insert(0,'-State totals-')

# Create array, (Max Number of names per state+1)x(Number of states + 1)
# The +1's are for keeping track of totals. The structure of the array is to be as follows:
# Actually, we can use python lists, which allow dynamic insertion. Every time we add a new name
# we'll insert a new list of length (numOfStates + 1)
# Actually, start with one entry, the totals row. Keep in mind that the first row is totals!!!

#______| total  | state1 | ... | 
#total |#totUS  |#inS1   | ... |
#name1 |#totn1  | ...          | 
#name2 | ...				   | 

# The array should be initialized to all zeros.

# Also maintain a list of names, whose ordering correspond to the ordering of rows in the number table.
nameTable = [[0.]*(numOfStates + 1)]
nameList = ['-Name Totals-']

# Go through each state file. Each line has a name, with a year of birth, a sex, and a frequency, ordered
# by frequency (followed by alphabetization)
# We wish to add names to the table in order of frequency, within the birth range.
# To add to the table, check if the name has been added to the table yet.
#	If it has, update the column for the current state, and all three relevant totals (current state, current name, current)
#	If it hasn't, insert a new row at the end, and update the same values.
for stateIndex in range(1,numOfStates+1):
	# Open the state's file (note the off by one: stateIndex corresponds to the column in nameTable, which starts at 1
	# to account for the totals 
	with open(fileList[stateIndex-1]) as csvfile:
		statereader = csv.reader(csvfile,delimiter=',')
		for row in statereader:
			if row[1] == sex and int(row[2]) <= maxYear and int(row[2]) >= minYear and notExcluded(row[3],ExcludedNames):
				# Name is valid, add it to the table
				# May as well keep the list of names sorted, so as to cut down on searches
				# Log2, baby!
				name = row[3]
				numInState = float(row[4])
				# Do a binary search for the current name in the list.
				lindex = 1
				uindex = len(nameList)-1
				nameIndex = (lindex+uindex)/2
				while (lindex <= uindex):
					nameIndex = (lindex+uindex)/2	# integer division!
					if nameList[nameIndex] > name:
						uindex = nameIndex - 1
					elif nameList[nameIndex] < name:
						lindex = nameIndex + 1
					else:
						break
				# If we didn't find the name in the list, insert a new row for it. In the end, nameIndex should
				# be the row to update.
				if nameList[nameIndex] > name:
					nameList.insert(nameIndex,name)
					nameTable.insert(nameIndex,[0.]*(numOfStates + 1))
					nameIndex = nameIndex
				elif nameList[nameIndex] < name:
					nameList.insert(nameIndex + 1,name)
					nameTable.insert(nameIndex + 1,[0.]*(numOfStates + 1))
					nameIndex = nameIndex + 1
						
				# Update the table now that it is the right shape.
				nameTable[nameIndex][stateIndex] = nameTable[nameIndex][stateIndex] + numInState
				nameTable[0][stateIndex] = nameTable[0][stateIndex] + numInState
				nameTable[nameIndex][0] = nameTable[nameIndex][0] + numInState
				nameTable[0][0] = nameTable[0][0] + numInState

				
# For future reference the code naturally divides here. Before this point we are populating
# the table of names, states, and name abundances. AFter this point we are doing analysis
# on the table. In principle we could read in the table once, store it in a more convenient form,
# and then do multiple analyses on it.

# Once the table is fully populated, calculate the statiness and find the max for each row.
# Declare a list, (Number of States)x(tuples with name row number and max statiness)
# For each state, calculate the statiness for each name, constantly update the max.
# Here we are actually keeping track of the top 3 max statiness names.
maxStat = [([0.]*(numOfStates + 1)),([0.]*(numOfStates + 1)),([0.]*(numOfStates + 1))]
maxStatNames = [(['None']*(numOfStates + 1)),(['None']*(numOfStates + 1)),(['None']*(numOfStates + 1))]
totUS = nameTable[0][0]
for stateIndex in range(1,numOfStates+1):
	numInState = nameTable[0][stateIndex]
	for nameIndex in range(1,len(nameList)):
		# ns, |s|, nUS, totUS
		stat = Statiness(float(nameTable[nameIndex][stateIndex]),float(numInState),float(nameTable[nameIndex][0]),float(totUS))
		if stat >= maxStat[0][stateIndex]:
			maxStat[0][stateIndex] = stat
			maxStatNames[0][stateIndex] = nameList[nameIndex]
		elif stat >= maxStat[1][stateIndex]:
			maxStat[1][stateIndex] = stat
			maxStatNames[1][stateIndex] = nameList[nameIndex]
		elif stat >= maxStat[2][stateIndex]:
			maxStat[2][stateIndex] = stat
			maxStatNames[2][stateIndex] = nameList[nameIndex]

# These last lines package together the state-y names together with their respective states and s-values
# and then prints it out.
results = list(map(list,zip(*([stateList,maxStatNames[0],maxStat[0],maxStatNames[1],maxStat[1],maxStatNames[2],maxStat[2]]))))
#print results[1:]
			
# Final result is the list of maximally statey names. If we wanted, we could even return the top 3.
with open(outputFile,mode='w') as outfile:
	writer = csv.writer(outfile,delimiter=',')
	for row in results[1:]:
		writer.writerow(row)