""" Run in Python version 2.6.5 on Tara Computing Cluster at UMBC. This code was written by Andrew Coates in partnership with Alexey Ilchenko and advisor Matthias K. Gobbert as a part of the REU Site: Interdisciplinary Program in High Performance Computing at UMBC in Summer 2011. More information can be found in our Tech. Report located at www.umbc.edu/hpcf > Publications as Technical Report HPCF-2011-13. Code last updated August 11, 2011. This code computes the exact value of the expected entropy of a string of length n from a set of size 4 as specifically applied to base pair analysis. To run, start python and enter '' >> from EHn import * '' and then '' >> EHn(pa,pt,pc,pg,n) '' where pa is the probability of a and pt is the probability of t, etc. and n is the number of samples. """ from math import log # crosses two sets and returns result # ie: [a b] X [c d] = [ac ad bc bd] def cross(u,v): cp = [] for a in u: for b in v: cp.append(str(a)+str(b)) return cp # counts the number of As Ts Cs and Gs in a string # and returns a list [#A, #T, #C,#G] def get_count(string): count = [0,0,0,0] for s in string: if s == 'A': count[0]+=1 elif s == 'T': count[1]+=1 elif s=='C': count[2]+= 1 elif s=='G': count[3]+=1 return count # returns the probability associated with a certain string based # on inputted genomic probabilities and the count of each base # in the string def prob(counts,probs): val = 1 for i in range(4): val *= probs[i]**counts[i] return val # returns the entropy of a string based on the observed counts of # bases in the string def Hs(counts,N): val = 0 for i in range(4): prob = float(counts[i])/float(N) if prob != 0: val += -prob*log2(prob) return val # computes log(x) with base 2 def log2(x): return log(x,2) # the main function # inputs the base probabilies and the number of sites # generates an exhaustive list of permutations and # then iterates through them computing the probability # of each string, the entropy of each string, and then # summing up their product # returns the Expected Entropy for n sites # aka returns E(H_n) def EHn(pa,pt,pc,pg,n): delta = ['A','T','C','G'] probs = [pa,pt,pc,pg] temp = delta sum = 0 if n > 1: for i in range(n-1): temp = cross(temp,delta) omega = temp for s in range(len(omega)): counts = get_count(omega[s]) p = prob(counts,probs) h = Hs(counts,n) sum += h*p return sum