I dont seem to get a hang of it. it is not working for all the sequences and all the matrices.what do i do?
waiting for your reply,
cheers!
waiting for your reply,
cheers!
datadict1=datadict.copy() for [B][U]keymain[/U][/B] in datadict: for keysub in datadict[keymain]: datadict1[keymain][keysub] /= (sum(values[int(keysub)-1])+4)
res=1 part="" q=len(p) seqq="" value={"A":0.3,"T":0.3,"C":0.2,"G":0.2} for i in range(q-16): part=p[i:i+16] seqq=part res=1 score=1 for j in range(16): key=seqq[j] res=res*datadict1[key]["%02d"%(j+1)] #print res for key in seqq: score=score * value[key] #print score,"*******************",res log_ratio=log10(res/score) print i,log_ratio
>CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133 CCAGTCCACCGGCCGCCGATCTATTTATACGAGAGGAAGAGGCTGAACTCGAGGATTACCCGTGTATCCTGGGACGCG GATTAGCGATCCATTCCCCTTTTAATCGCCGCGCAAACAGATTCATGAAAGCCTTCGGATTCATTCATTGATCCACAT CTACGGGAACGGGAGTCGCAAACGTTTTCGGATTAGCGCTGGACTAGCGGTTTCTAAATTGGATTATTTCTACCTGAC CCTGGAGCCATCGTCCTCGTCCTCCGTCCCTTAGCGCCTCCTGCATGGATGTCGTTTTTGGGTTTCATACCTTTTCAC ACTGGAAAAATACGGAATTTGTTGTAAGCCCTTTCAAGACGAATGGGATTTAGCTTCGGATGTCAACGTCACCATAAT CATATTAGGAATATTTCTACTCAATTGCAATATTGGTACTTTTCTGACTGTAAACGCGATGATAATTACAAATATGCC TAATTTGCTGTCTTTATAATCAAATGGAGTTCTTTATATTTCCAAAATATTGAAATTCCGATTCCCTAGAAAATAATA CGTTTTTCTGTTATTAATAAAAAACCAATAGGAAAGTTCTCAAAAATTACTCTGTTGTATTTGATCATTTCTTTTCCG GTATAATCTTTTATTTTAAGCATTCCCATGTGAATAAATTTCAGACTAATGTATTAATAAGATGTCGTGTTTTTCCAC TTACAAATTTCTCATACAGCTGGATATATACTACGAGTACTATACACATGCTCTGGG
def parseArray(fn, dataset=1, key='PO', term='/'): ''' Read a formatted data file in matrix format and compile data into a dictionary ''' f = open(fn) # skip to required data set for _ in range(dataset): try: line = f.next() while not line.startswith(key): line = f.next() except StopIteration, e: print 'We have reached the end of the file!' f.close() return False headerList = line.strip().split()[1:] lineList = [] line = f.next().strip() while not line.startswith(term): if line != '': lineList.append(line.strip().split()) line = f.next().strip() f.close() # Key list keys = [i[0] for i in lineList] print keys # Values list values = [[float(s) for s in item] for item in [j[1:] for j in lineList]] # Create a dictionary from keys and values lineDict = dict(zip(keys, values)) dataDict = {} for i, item in enumerate(headerList): dataDict[item] = {} for key in lineDict: dataDict[item][key] = lineDict[key][i] # Add 1.0 to every element in dataDict subdictionaries for keyMain in dataDict: for keySub in dataDict[keyMain]: dataDict[keyMain][keySub] += 1.0 # Normalize original data (with 1 added) and update data valueSums = [sum(item)+4 for item in values] dataDict1=dataDict.copy() for keyMain in dataDict: for keySub in dataDict[keyMain]: dataDict1[keyMain][keySub] /= (sum(valueSums[int(keySub)-1]+4 return dataDict1 def parseData(fn, dataset=1, key='>'): ''' Read a formatted data file of alpha sequences Return a list of sequences ''' # initialize output list dataList = [] # open file for reading f = open(fn) # skip to required data set for _ in range(dataset): try: s = f.next() while not s.startswith(key): s = f.next() except StopIteration, e: print 'We have reached the end of the file!' f.close() return False for line in f: if not line.startswith(key): dataList.append(line.strip()) else: break f.close() return dataList if __name__ == '__main__': value={"A":0.3,"T":0.3,"C":0.2,"G":0.2} fnArray = 'all_redfly.transfac.txt' fnSeq = 'redfly_sequence.fasta' dataset = 3 dataArray = parseArray(fnArray, dataset) dataSeq = parseData(fnSeq, dataset) seq = ''.join(dataSeq[1:]) subKeys = dataArray['A'].keys() subKeys.sort() i,j = divmod(len(seq), len(subKeys)) keys = subKeys*i + subKeys[:j] print dataSeq[0], outList = ['%s[%s]*%s = %0.4f' % (s, keys[i], s, dataArray[s][keys[i]]*value[s]) for i, s in enumerate(seq)] print '\n'.join(outList) print sum([float(s.split('=')[1]) for s in outList])
>CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133 CCAGTCCACCGGCCGCCGATCTATTTATACGAGAGGAAGAGGCTGAACTCGAGGATTACCCGTGTATCCTGGGACGCG GATTAGCGATCCATTCCCCTTTTAATCGCCGCGCAAACAGATTCATGAAAGCCTTCGGATTCATTCATTGATCCACAT CTACGGGAACGGGAGTCGCAAACGTTTTCGGATTAGCGCTGGACTAGCGGTTTCTAAATTGGATTATTTCTACCTGAC CCTGGAGCCATCGTCCTCGTCCTCCGTCCCTTAGCGCCTCCTGCATGGATGTCGTTTTTGGGTTTCATACCTTTTCAC ACTGGAAAAATACGGAATTTGTTGTAAGCCCTTTCAAGACGAATGGGATTTAGCTTCGGATGTCAACGTCACCATAAT CATATTAGGAATATTTCTACTCAATTGCAATATTGGTACTTTTCTGACTGTAAACGCGATGATAATTACAAATATGCC TAATTTGCTGTCTTTATAATCAAATGGAGTTCTTTATATTTCCAAAATATTGAAATTCCGATTCCCTAGAAAATAATA CGTTTTTCTGTTATTAATAAAAAACCAATAGGAAAGTTCTCAAAAATTACTCTGTTGTATTTGATCATTTCTTTTCCG GTATAATCTTTTATTTTAAGCATTCCCATGTGAATAAATTTCAGACTAATGTATTAATAAGATGTCGTGTTTTTCCAC TTACAAATTTCTCATACAGCTGGATATATACTACGAGTACTATACACATGCTCTGGG
def parseArray(fn, dataset=1, key='PO', term='/'): ''' Read a formatted data file in matrix format and compile data into a dictionary ''' ..................................... ..................................... print '\n'.join(outList) print sum([float(s.split('=')[1]) for s in outList])
>CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133 C[01]/C = 1.922438822066 C[02]/C = 0.207382828702 A[03]/A = 0.939745715865 G[04]/G = 0.746578183326 T[05]/T = 1.200055302088 C[06]/C = 0.207296849088 C[07]/C = 0.327664869349 A[08]/A = 2.918567675930 C[09]/C = 0.217751970137 C[10]/C = 0.207382828702 G[11]/G = 0.207382828702 G[12]/G = 0.207382828702 C[13]/C = 0.279850746269 C[14]/C = 0.709249274160 G[15]/G = 3.334715885525 C[16]/C = 0.300705101618 C[01]/C = 1.922438822066 G[02]/G = 2.285358772294 A[03]/A = 0.939745715865 T[04]/T = 0.138255219135 C[05]/C = 0.207382828702 T[06]/T = 0.138197899392 A[07]/A = 0.145167980091 T[08]/T = 0.138255219135 T[09]/T = 0.138255219135 T[10]/T = 0.138255219135 A[11]/A = 0.138255219135 T[12]/T = 0.138255219135 A[13]/A = 2.209784411277 C[14]/C = 0.709249274160 G[15]/G = 3.334715885525 A[16]/A = 1.137840453477 G[01]/G = 0.207382828702 A[02]/A = 0.138255219135 G[03]/G = 0.207296849088 G[04]/G = 0.746578183326
>>> result 2.4243160673268744e-210 >>>
q=len(p) value={"A":0.3,"T":0.3,"C":0.2,"G":0.2} for i in range(q-len(datadict['A'])): part=p[i:i+len(datadict['A'])] seqq=part res=1 score=1 for j in range(len(datadict['A'])): key=seqq[j] res=res*datadict[key]["%02d"%(j+1)] for key in seqq: score=score * value[key] log_ratio=log10(res/score) print i,log_ratio
>>> 0.3**16 4.3046720999999976e-009
Comment