hey!
That code seens to work but the problem is it is checking only for the first six files... i donno where is the looping problem!!
can you see it??
waiting for ur reply!
cheers!!
That code seens to work but the problem is it is checking only for the first six files... i donno where is the looping problem!!
can you see it??
Code:
from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):
'''
Read a formatted data file in matrix format and
compile data into a dictionary
'''
f = open(fn)
# skip to required data set
for _ in range(dataset):
try:
line = f.next()
while not line.startswith(key):
line = f.next()
except StopIteration, e:
print 'We have reached the end of the file!'
f.close()
return False
headerList = line.strip().split()[1:]
lineList = []
line = f.next().strip()
while not line.startswith(term):
if line != '':
lineList.append(line.strip().split())
line = f.next().strip()
f.close()
# Key list
keys = [i[0] for i in lineList]
# Values list
values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
# Create a dictionary from keys and values
lineDict = dict(zip(keys, values))
dataDict = {}
for i, item in enumerate(headerList):
dataDict[item] = {}
for key in lineDict:
dataDict[item][key] = lineDict[key][i]
# Add 1.0 to every element in dataDict subdictionaries
for keyMain in dataDict:
for keySub in dataDict[keyMain]:
dataDict[keyMain][keySub] += 1.0
# Normalize original data (with 1 added) and update data
valueSums = [sum(item)+4 for item in values]
# print valueSums
for keyMain in dataDict:
for keySub in dataDict[keyMain]:
dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
return dataDict
def parseData(fnSeq, dataset=1, key='>'):
'''
Read a formatted data file of sequences
Return a list of sequences
The first element in the list is the header
'''
# initialize output list
dataList = []
# open file for reading
f = open(fnSeq)
# skip to required data set
for _ in range(dataset):
try:
s = f.next()
while not s.startswith(key):
s = f.next()
except StopIteration, e:
print 'We have reached the end of the file!'
f.close()
return False
# initialize output list
dataList = [s,]
for line in f:
if not line.startswith(key):
dataList.append(line.strip())
else:
break
f.close()
return dataList
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
# sequence factor dictionary
value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
dataArray = parseArray(fnArray, arraySet)
if dataArray:
dataSeq = parseData(fnSeq, seqSet)
if not dataSeq:
return False
else:
return None
# This is the complete sequence
seq = ''.join(dataSeq[1:])
# These are the subkeys of dataArray - '01', '02', '03',.............
subKeys = dataArray['A'].keys()
subKeys.sort()
# Calculate num/den for each slice of sequence
# Each sequence slice length = length of subKeys
# Example:
# seq = 'ATCGATA'
# subKeys length = 3
# 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
numList = []
denList = []
seqList = []
for i in xrange(len(seq) - len(subKeys)):
subseq = seq[0:len(subKeys)]
seqList.append(subseq)
num, den = 1, 1
for j, s in enumerate(subseq):
num *= dataArray[s][subKeys[j]]
den *= value[s]
numList.append(num)
denList.append(den)
seq = seq[1:]
resultList = []
for i, num in enumerate(numList):
#p=log10(num/denList[i])
#if (p) >=2:
#print "#########",abs(int(p))
#if (log10(num/denList[i]))>=2:
#print "i am here"
resultList.append(int(abs(1)))
#print resultList
#for i in resultList:
#mean=sum(resultList)/len(resultList)
#sub=mean-i
#queue = []
#queue = (sub)**2
#print sqrt(queue/len(resultList))
#print mean,"@@@@@@@@@@"
outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
#print "this is line 294"
return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
if __name__ == '__main__':
fnArray ='one_redfly.transfac'
fnSeq = 'deepthi/upstream_regions'
import os
dir_name='upstream_regions'
fList=os.listdir(dir_name)
fList1=[os.path.join(dir_name,f) for f in fList if os.path.isfile(os.path.join(dir_name,f))]
seqSetIndex=0
fnSeq=fList1[seqSetIndex]
while True:
outputfile = "sequence_calc_data.txt"
arraySet = 1
outList = []
calcdata = 1
while not calcdata is None:
seqSet = 1
while True:
calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
print calcdata
if calcdata:
outList.append(calcdata)
seqSet += 1
else:
break
arraySet += 1
seqSetIndex+=1
else:
break
f = open(outputfile, 'w')
f.write('\n'.join(outList))
f.close()
#f=open(outputfile,"r")
#file_con=f.readlines()
#for line in file_con:
# print line
cheers!!
Comment