""" Run in Python version 2.6.5 on Tara Computing Cluster at UMBC.
This code was written by Andrew Coates in partnership with Alexey
Ilchenko and advisor Matthias K. Gobbert as a part of the REU Site:
Interdisciplinary Program in High Performance Computing at UMBC in
Summer 2011. More information can be found in our Tech. Report located
at www.umbc.edu/hpcf > Publications as Technical Report HPCF-2011-13.
Code last updated August 11, 2011.
This code computes the exact value of the expected entropy of a string
of length n from a set of size 4 as specifically applied to base pair
analysis.
To run, start python and enter '' >> from EHn import * '' and then
'' >> EHn(pa,pt,pc,pg,n) '' where pa is the probability of a and
pt is the probability of t, etc. and n is the number of samples.
"""
from math import log
# crosses two sets and returns result
# ie: [a b] X [c d] = [ac ad bc bd]
def cross(u,v):
cp = []
for a in u:
for b in v:
cp.append(str(a)+str(b))
return cp
# counts the number of As Ts Cs and Gs in a string
# and returns a list [#A, #T, #C,#G]
def get_count(string):
count = [0,0,0,0]
for s in string:
if s == 'A':
count[0]+=1
elif s == 'T':
count[1]+=1
elif s=='C':
count[2]+= 1
elif s=='G':
count[3]+=1
return count
# returns the probability associated with a certain string based
# on inputted genomic probabilities and the count of each base
# in the string
def prob(counts,probs):
val = 1
for i in range(4):
val *= probs[i]**counts[i]
return val
# returns the entropy of a string based on the observed counts of
# bases in the string
def Hs(counts,N):
val = 0
for i in range(4):
prob = float(counts[i])/float(N)
if prob != 0:
val += -prob*log2(prob)
return val
# computes log(x) with base 2
def log2(x):
return log(x,2)
# the main function
# inputs the base probabilies and the number of sites
# generates an exhaustive list of permutations and
# then iterates through them computing the probability
# of each string, the entropy of each string, and then
# summing up their product
# returns the Expected Entropy for n sites
# aka returns E(H_n)
def EHn(pa,pt,pc,pg,n):
delta = ['A','T','C','G']
probs = [pa,pt,pc,pg]
temp = delta
sum = 0
if n > 1:
for i in range(n-1):
temp = cross(temp,delta)
omega = temp
for s in range(len(omega)):
counts = get_count(omega[s])
p = prob(counts,probs)
h = Hs(counts,n)
sum += h*p
return sum