##!/usr/bin/python # run_assoc_test: this file simply takes data from a gwas file and runs a # chi-squared test to determine if any SNPs are statistically significant. # To use: # (1) cd to/the/directory/where/the/.ped/file/is # (2) save this python file in that directory # (3a) run using this command(rename file names as appropriate): # python pythonfile.py pedfile.ped output.txt > output2.txt # this will allow you to store the print statements in a file for later use # Note: the output can be whatever output format you choose # (3b)an alternative command is: # python pythonfile.py pedfile.ped outputfile.txt # this will display all print commands on the command line # Note: the output can be whatever output format you choose # Mollee Jain, (TSRI-STSI Intern, Aug 2013) ###PLEASE READ##### #############################NOTES:########################################### #SCIPY: this file imports stats from the scipy module. This allows us to calculate the p-value from a chi-squared test. This needs to be downloaded before use of this file. For Linux Mint 15 Olivia I simply opened the Synaptic Package Manager and installed scipy. #TYPE OF CHI-SQUARED TEST: This program runs a chi-squared test as a 2X2 contingency table. The calculated p-value is standard - the Fisher Exact Test and the Yates Correction are not utilized in the p-value calculation. #DEGRESS OF FREEDOM:the degrees of freedom (df) for this test is 1. If df for your sample is not 1, you will need to edit this code under the 'pvalue' function. #WHERE TO START: the numbers 6 and 7 indicate where the first pair of nucleotides are. Change these numbers in the 'run' function if the location of the first pair of nucleotides are not at '6'&'7'. To find the location of the first pair of nucleotides - print the line variable on line 122 and count (starting from 0) to see where the first nucleotide is ############################################################################## #imports from sys import argv from collections import Counter from scipy import stats #allows us to calculate the p-value import math #arguments to be provided script, filename, output = argv #script - this file #filename - GWAS file (typically in .ped format; i.e. genotypes.ped) #output - file to write results to (in whatever format you choose; i.e. sampleoutput.txt) ########################DEFINITIONS/LISTS:################################### to_file = open(output, 'w') #output file first = [] #a list of the first nucleotide of the SNP pair(ex.'C' in CG) second = [] #a list of the second nucleo. of the SNP pair(ex. 'G' in CG) chi = [] #a list that will help compute the chi-squared value count1 = [] #a list for the count of nucleotides (1st in the pair) count2 = [] #a list for the count of nucleotides (2nd in the pair) ########################FUNCTIONS:########################################## ### A function to split GWAS file to separate pairs of SNPS (ex.AG) ### def split_snp(x,y): with open(filename) as f: for snpfile in f: line = snpfile.split(" ") #split each line in .ped file by spaces firstpart = line[x] #first nucleo. of the SNP pair secondpart = line[y] #second nucleo. of the SNP pair first.append(firstpart) #append first nucleo. to list 'first' second.append(secondpart) #append second nucelo. to list 'second' f.close() ### A function to calculate the chisquared value of a pair of SNPs ### def chisq(): #a count of whatever is in the column (i.e. count of how many C #and G's are in the first column of the SNPs) firstcounter = Counter(first) secondcounter = Counter(second) count1.append(firstcounter) count2.append(secondcounter) if len(first) == len(second): #a count of items in a row -- total_row = len(first) #the counts should be the same! else: print "Error: the length of each column is not the same." del first[:] #empties lists to allow for lists to be reused del second[:] #setup for a chisquared with expected and observed values for i in firstcounter or secondcounter: total_column = firstcounter[i] + secondcounter[i] #gives total of counts expected = float(total_column*total_row)/(total_row*2) #expected counts formula #(to see observed and expected values, print the line below:) #print i,firstcounter[i],secondcounter[i],"||",expected chisq = "%0.5f"%(((firstcounter[i]-expected)**2/expected) + ((secondcounter[i]-expected)**2/expected)) #calculation of the chi-squared value chi.append(float(chisq)) #appends chi-squared value to a list 'chi' return sum(chi) #returns the value of the chi-squared test ### A function to calculate the p-value of the chi-square value of a SNP pair ### def pvalue(): chisq_value = float(sum(chi)) print "CHI-SQUARED VALUE: " + str(chisq_value) df = float(1) #degrees of freedom - see NOTES p_value = 1 - stats.chi2.cdf(float(chisq_value),df) #using scipy to calculate pvalue print "P-VALUE: " + str(p_value) print count1,count2 significance = 1.0e-8 #alter this variable to output more/less significant #p-values to the output file if p_value < significance: to_file.write(str(count1)) to_file.write(str(count2)) to_file.write("\t") to_file.write(str(p_value)) to_file.write("\t") to_file.write(str(chisq())) to_file.write("\n") else: None ####A function to run the whole program###### def run(x,y): with open(filename) as f: for snpfile in f: line = snpfile.split(" ") #split each line in .ped file by spaces for i in range (6,len(line)-2): #loops over each SNP pair in the data set print "\n" #loops through the length of the gwas file to do a chi-squared test #stops when there are no further nucleotides if x != 6 and y != 7: try: split_snp(x,y) print "PAIR ID: " + str(x) + " and " + str(y) x += 2 y += 2 chisq() pvalue() del count1[:] #empties lists to allow for lists to be reused del count2[:] del chi[:] #signals the for loop to stop except: break #this starts off the loop at the first pair of nucleotides else: print "PAIR ID: " + str(x) + " and " + str(y) split_snp(x,y) x += 2 y += 2 chisq() pvalue() del count1[:] #empties lists to allow for lists to be reused del count2[:] del chi[:] f.close() to_file.write(" Count of nucleotides in SNP Pair ||| ") to_file.write("\t") to_file.write("P-Value ||| ") to_file.write("\t") to_file.write("Chi-Squared Value") to_file.write("\n\n") run(6,7) #runs the whole program.