Am getting a list index out of range error!!
but I cant add a -1 to the loop.can i??whats wrong with this??
but I cant add a -1 to the loop.can i??whats wrong with this??
Code:
from math import * def parseArray(fn, dataset=1, key='PO', term='/'): ''' Read a formatted data file in matrix format and compile data into a dictionary ''' f = open(fn) # skip to required data set for _ in range(dataset): try: line = f.next() while not line.startswith(key): line = f.next() except StopIteration, e: print 'We have reached the end of the file!' f.close() return False headerList = line.strip().split()[1:] lineList = [] line = f.next().strip() while not line.startswith(term): if line != '': lineList.append(line.strip().split()) line = f.next().strip() f.close() # Key list keys = [i[0] for i in lineList] # Values list values = [[float(s) for s in item] for item in [j[1:] for j in lineList]] # Create a dictionary from keys and values lineDict = dict(zip(keys, values)) dataDict = {} for i, item in enumerate(headerList): dataDict[item] = {} for key in lineDict: dataDict[item][key] = lineDict[key][i] # Add 1.0 to every element in dataDict subdictionaries for keyMain in dataDict: for keySub in dataDict[keyMain]: dataDict[keyMain][keySub] += 1.0 # Normalize original data (with 1 added) and update data valueSums = [sum(item)+4 for item in values] # print valueSums for keyMain in dataDict: for keySub in dataDict[keyMain]: dataDict[keyMain][keySub] /= valueSums[int(keySub)-1] return dataDict def parseData(fn, dataset=1, key='>'): ''' Read a formatted data file of sequences Return a list of sequences The first element in the list is the header ''' # initialize output list dataList = [] # open file for reading f = open(fn) # skip to required data set for _ in range(dataset): try: s = f.next() while not s.startswith(key): s = f.next() except StopIteration, e: print 'We have reached the end of the file!' f.close() return False # initialize output list dataList = [s,] for line in f: if not line.startswith(key): dataList.append(line.strip()) else: break f.close() return dataList if __name__ == '__main__': arraySet = 4 #print arraySet seqSet = 4 #print seqSet value={"A":0.3,"T":0.3,"C":0.2,"G":0.2} fnArray = r'all_redfly.transfac.txt' fnSeq = r'redfly_sequence.fasta' indxSeq=1 while True: dataSeq=parseData(fnSeq,indxSeq) if not dataSeq: break indxArray=1 while True: dataArray = parseArray(fnArray, arraySet) #dataSeq = parseData(fnSeq, seqSet) if not dataArray: break # This is the complete sequence seq = ''.join(dataSeq[1:]) # These are the subkeys of dataArray - '01', '02', '03',............. subKeys = dataArray['A'].keys() subKeys.sort() # Calculate num/den for each slice of sequence # Each sequence slice length = length of subKeys # Example: # seq = 'ATCGATA' # subKeys length = 3 # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA' numList = [] denList = [] seqList = [] for i in xrange(len(seq) - len(subKeys) + 1): subseq = seq[0:len(subKeys)] seqList.append(subseq) num, den = 1, 1 for j, s in enumerate(subseq): num *= dataArray[s][subKeys[j]] den *= value[s] numList.append(num) denList.append(den) seq = seq[1:] resultList = [] for i, num in enumerate(numList): resultList.append(log10(num/denList[i])) indxArray+=1 indxSeq +=1 outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res) for i, res in enumerate(resultList)]) print 'Array set # = %d\nSequence set # = %d' % (arraySet, seqSet) print 'Sequence Header: %s' % dataSeq[0] print outStr
Comment