looping through a big file containing a set of files.

**aboxylica** · Jul 17 '07, 04:46 PM

Am getting a list index out of range error!!
but I cant add a -1 to the loop.can i??whats wrong with this??

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())

        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

 
    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):

        try:

            s = f.next()

            while not s.startswith(key):

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList

 

if __name__ == '__main__':

 

    arraySet = 4
    #print arraySet

    seqSet = 4
    #print seqSet

 

    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}

      

    fnArray = r'all_redfly.transfac.txt'

    fnSeq = r'redfly_sequence.fasta'
    indxSeq=1
    while True:
        dataSeq=parseData(fnSeq,indxSeq)
        if not dataSeq:
            break
        indxArray=1
        while True:
                dataArray = parseArray(fnArray, arraySet)
                #dataSeq = parseData(fnSeq, seqSet)
                if not dataArray:
                    break
                # This is the complete sequence
                seq = ''.join(dataSeq[1:])
                # These are the subkeys of dataArray - '01', '02', '03',.............
                subKeys = dataArray['A'].keys()
                subKeys.sort()

 

    # Calculate num/den for each slice of sequence

    # Each sequence slice length = length of subKeys

    # Example:

    # seq = 'ATCGATA'

    # subKeys length = 3

    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

                numList = []

                denList = []

                seqList = []

                for i in xrange(len(seq) - len(subKeys) + 1):

                    subseq = seq[0:len(subKeys)]

                    seqList.append(subseq)
                    num, den = 1, 1

                    for j, s in enumerate(subseq):

                        num *= dataArray[s][subKeys[j]]

                        den *= value[s]

                        numList.append(num)

                        denList.append(den)

                        seq = seq[1:]

 

                        resultList = []

                        for i, num in enumerate(numList):
        
                            resultList.append(log10(num/denList[i]))
                    indxArray+=1
                indxSeq +=1
                
                outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res) for i, res in enumerate(resultList)])
                print 'Array set # = %d\nSequence set # = %d' % (arraySet, seqSet)
                print 'Sequence Header: %s' % dataSeq[0]
                print outStr

**bvdet** · Jul 18 '07, 12:50 AM

Let's make a new function, iterate on it, and write the results to a file:
[code=Python]def compileData(fnA rray, fnSeq, arraySet=1, seqSet=1):
# sequence factor dictionary
value={"A":0.3, "T":0.3,"C":0.2 ,"G":0.2}

dataArray = parseArray(fnAr ray, arraySet)
if dataArray:
dataSeq = parseData(fnSeq , seqSet)
if not dataSeq:
return False
else:
return None

# This is the complete sequence
seq = ''.join(dataSeq[1:])
# These are the subkeys of dataArray - '01', '02', '03',.......... ...
subKeys = dataArray['A'].keys()
subKeys.sort()

# Calculate num/den for each slice of sequence
# Each sequence slice length = length of subKeys
# Example:
# seq = 'ATCGATA'
# subKeys length = 3
# 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
numList = []
denList = []
seqList = []
for i in xrange(len(seq) - len(subKeys) + 1):
subseq = seq[0:len(subKeys)]
seqList.append( subseq)
num, den = 1, 1
for j, s in enumerate(subse q):
num *= dataArray[s][subKeys[j]]
den *= value[s]
numList.append( num)
denList.append( den)
seq = seq[1:]

resultList = []
for i, num in enumerate(numLi st):
resultList.appe nd(num/denList[i])

outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res) for i, res in enumerate(resul tList)])
return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)

if __name__ == '__main__':

fnArray = 'array.txt'
fnSeq = 'seq.txt'

outputfile = 'sequence_calc_ data.txt'

arraySet = 1
outList = []
calcdata = 1
while not calcdata is None:
seqSet = 1
while True:
calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
if calcdata:
outList.append( calcdata)
seqSet += 1
else:
break
arraySet += 1

f = open(outputfile , 'w')
f.write('\n'.jo in(outList))
f.close() [/code]This resulted in a 3.1 mb file. Following are the first few lines of the first and last compilation:

Code:

Array set # = 1
Sequence set # = 1
Sequence Header: >CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133

Sequence = CCAGTCCACCGGCCGC Calculation = 0.000025722315
Sequence = CAGTCCACCGGCCGCC Calculation = 0.000000000318
Sequence = AGTCCACCGGCCGCCG Calculation = 0.000595631200
Sequence = GTCCACCGGCCGCCGA Calculation = 0.000120125057
Sequence = TCCACCGGCCGCCGAT Calculation = 0.000000089016
...........................
Array set # = 4
Sequence set # = 8
Sequence Header: >Obp19b_prom|Drosophila melanogaster|Obp19b|FBgn0031110|X:20224439..20227440

Sequence = ATTGCTGACGGGTCGA Calculation = 0.000005535136
Sequence = TTGCTGACGGGTCGAA Calculation = 0.000003984295
Sequence = TGCTGACGGGTCGAAT Calculation = 0.000053179344
Sequence = GCTGACGGGTCGAATG Calculation = 0.000031549069
.............................

**aboxylica** · Jul 18 '07, 04:17 AM

THis is the code.my o/p is an empty array.why is this happening?

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())

        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

 
    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):

        try:

            s = f.next()

            while not s.startswith(key):

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList

 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)
   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)
   
        if not dataSeq:
   
            return False
  
        else:
  
            return None
  
         
  
        # This is the complete sequence 
  
        seq = ''.join(dataSeq[1:])

        # These are the subkeys of dataArray - '01', '02', '03',.............
  
        subKeys = dataArray['A'].keys()

        subKeys.sort()
  
       
  
        # Calculate num/den for each slice of sequence
  
          # Each sequence slice length = length of subKeys
  
          # Example:
            # seq = 'ATCGATA'
  
          # subKeys length = 3
  
          # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

        numList = []
  
        denList = []
  
        seqList = []
  
        for i in xrange(len(seq) - len(subKeys) + 1):
  
            subseq = seq[0:len(subKeys)]
  
            seqList.append(subseq)
  
            num, den = 1, 1
  
            for j, s in enumerate(subseq):
  
                num *= dataArray[s][subKeys[j]]
  
                den *= value[s]
  
                numList.append(num)
  
                denList.append(den)
  
                seq = seq[1:]
  
       
  
        resultList = []
  
        for i, num in enumerate(numList):

            resultList.append(num/denList[i])
  
       
  
            outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)   for i, res in enumerate(resultList)])
  
            return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
  
if __name__ == '__main__':
  
         
    fnArray =r'all_redfly.transfac' 
    fnSeq = r'redfly_sequence.fasta'
  
    outputfile =  "sequence_calc_data.txt"
  
         
  
    arraySet = 1
  
    outList = []
  
    calcdata = 1
  
    while not calcdata is None:
  
        seqSet = 1
  
        while True:
  
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            print calcdata
  
            if calcdata:
  
                outList.append(calcdata)
  
                seqSet += 1
  
            else:
  
                break
  
        arraySet += 1
        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    f=open(outputfile,"r")
    file_con=f.readlines()
    print file_con
    for line in file_con:
        print line

**aboxylica** · Jul 18 '07, 06:11 AM

I seem to get an list index out of range error:
Traceback (most recent call last):
File "newbie1.py ", line 311, in <module>
calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
File "newbie1.py ", line 285, in compileData
outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)for i, res in enumerate(resul tList)])
IndexError: list index out of range

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)
   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()
    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys) + 1):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
            numList.append(num)
  
            denList.append(den)
  
            seq = seq[1:]
  
       
  
    resultList = []
  
    for i, num in enumerate(numList):

        resultList.append(log10(num/denList[i]))
        print (resultList)
  
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)for i, res in enumerate(resultList)])
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
  
if __name__ == '__main__':
  

    fnArray ='all_redfly.transfac' 
    fnSeq = 'redfly_sequence.fasta'
  
    outputfile =  "sequence_calc_data.txt"
  
         
  
    arraySet = 1
  
    outList = []
  
    calcdata = 1
  
    while not calcdata is None:
  
        seqSet = 1
  
        while True:
  
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
  
            if calcdata:
  
                outList.append(calcdata)
  
                seqSet += 1
  
            else:
  
                break
  
        arraySet += 1

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    f=open(outputfile,"r")
    file_con=f.readlines()
    print file_con
    for line in file_con:
        print line

waiting for ur reply,
cheers!

**bvdet** · Jul 18 '07, 12:54 PM

I am not sure why you add so many spaces in between the lines of code. I personally find it unreadable. Anyway, when you were adding all the spaces, some of the code ended up at the incorrect indentation:[code=Python]........for j, s in enumerate(subse q):

num *= dataArray[s][subKeys[j]]

den *= value[s]

numList.append( num)

denList.append( den)

seq = seq[1:]
[/code]SHOULD BE:[code=Python]........for j, s in enumerate(subse q):
num *= dataArray[s][subKeys[j]]
den *= value[s]
numList.append( num)
denList.append( den)
seq = seq[1:][/code]

**aboxylica** · Jul 18 '07, 01:32 PM

hey,
That was the mistake.amazing !! thanks a million!!
I got some doubts about the program.
i have some doubts. first understandingan d then get back to you.
THANKS A MILLION!
cheers!!

**aboxylica** · Jul 18 '07, 06:01 PM

hey,
here is the code where I tried removing the try catch block and couple of things which will make it easier for me to understand.but looks like there is some problem ..I will of course use them in my main program.But I was just trying to understand when I tried executing the iteration was not happening and when I said
print outList instead of storing it in a file it was not iterating.This is the code
can you tell me whats happening???

Code:

from math import *
def parseArray(fn,dataset=1,key='PO',term='/'):
    f=open(fn)
    for _ in range(dataset):
        line=f.next()
        while not line.startswith(key):
            line=f.next()
    headerList=line.strip().split()[1:]
    lineList=[]
    line=f.next().strip()
    while not line.startswith(term):
        if line!='':
            lineList.append(line.strip().split())
        line=f.next().strip()
        # f.close()
    keys=[i[0] for i in lineList]
    values=[[float(s) for s in item] for item in [j[1:] for j in lineList]]
    lineDict=dict(zip(keys,values))
    dataDict={}
    for i,item in enumerate(headerList):
        dataDict[item]={}
        for key in lineDict:
            dataDict[item][key]=lineDict[key][i]
    for keyMain in dataDict:
        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub]+=1.0
    valueSums=[sum(item)+4 for item in values]
    for keyMain in dataDict:
        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub]/=valueSums[int(keySub)-1]
    return dataDict
#fn="weight_matrix.transfac.txt"
#p=parseArray(fn)
#print p
def parseData(fn,dataset=1,key='>'):
    dataList=[]
    f=open(fn)
    for _ in range(dataset):
        s=f.next()
    dataList=[s,]
    
    for line in f:
        if not line.startswith(key):
            dataList.append(line.strip())
        else:
            break
    return dataList
#fn="redfly_sequence.fasta"
#p=parseData(fn)
#print p
def compileData(fnArray,fnSeq,arraySet=1,seqSet=1):
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
    dataArray=parseArray(fnArray,arraySet)
    if dataArray:
        dataSeq=parseData(fnSeq,seqSet)
    seq=''.join(dataSeq[1:])
    subKeys=dataArray['A'].keys()
    subKeys.sort()
    numList=[]
    denList=[]
    seqList=[]
    for i in xrange(len(seq)-len(subKeys)):
        subseq=seq[0:len(subKeys)]
        seqList.append(subseq)
        num,den=1,1
        for j,s in enumerate(subseq):
            num*=dataArray[s][subKeys[j]]
            den*=value[s]
        numList.append(num)
        denList.append(den)
        seq=seq[1:]
    resultList=[]
    for i,num in enumerate(numList):
        if (log10(num/denList[i]))>2:
            resultList.append(log10(num/denList[i]))
    outStr='\n'.join(['sequence=%s Calculation=%0.12f'%(seqList[i],res) for i,res in enumerate(resultList)])
    return 'array set#= %d\nSequence set #=%d\nSequence Header: %s\n%s' %(arraySet,seqSet,dataSeq[0],outStr)
fnArray='weight_matrix.transfac.txt'
fnSeq='redfly_sequence.fasta'
arraySet=1
outList=[]
calcdata=1
while not calcdata is None:
    seqSet=1
    while True:
        calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
        if calcdata:
            outList.append(calcdata)
            
    
            seqSet+=1
        else:
            break
        
    arraySet+=1
print outList
f=open(outputfile,'w')
f.write('/n'.join(outList))
f.close()

waiting
cheers!!

**bvdet** · Jul 18 '07, 08:17 PM

After running the script, I can do this:[code=Python]>>> print outList[1]
Array set # = 1
Sequence set # = 2
Sequence Header: >Cp36_DRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8323349..8324 136

Sequence = AGTCGACCAGCACGA G Calculation = -0.872390330485
Sequence = GTCGACCAGCACGAG A Calculation = -3.287525755636
Sequence = TCGACCAGCACGAGA T Calculation = -4.346213357398
Sequence = CGACCAGCACGAGAT C Calculation = -2.329064001005
............... ..........[/code]I don't want to print the entire outList because it's over 3 MB.
You may have changed something you should not have. Maybe you should copy the code again. If you need to change things, change only one thing at a time and test to make sure it still works.

**aboxylica** · Dec 11 '07, 07:10 AM

hello!
I hope you people remember the problem above..
i got little problems with that
that was just opening a file containing files..now il be opening a directory containing different sequence files
this is how the code looks now!
am trying to change the i/p file to folder by showing the path of the folder but its going to the exception file..can you tell me why?

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        
        if (log10(num/denList[i]))>=2:
          
	    resultList.append(int(abs(1)))
   
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
    
  

    fnArray ='half.txt'
   
    fnSeq = 'C:\\python25\ding\YAL005C.txt'
    
    
  
    outputfile =  "sequence_calc_data.txt"
  
         
  
    arraySet = 1
  
    outList = []
  
    calcdata = 1
  
    while not calcdata is None:
  
        seqSet = 1
  
        while True:
  
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            print calcdata
  
            if calcdata:
  
                outList.append(calcdata)
  
                seqSet += 1
  
            else:
  
                break
  
        arraySet += 1

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

please tell me what can i do??

**aboxylica** · Dec 11 '07, 11:00 AM

here is my code which is reading a directory containing files..... it seems to go to the exception part always.. i dono why..i think it checks for the first file in the folder and then comes out..how do i check if its going to all the files..

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()
            print "am here"

            while not line.startswith(key):
                print "oh yes"

                line = f.next()

        except StopIteration, e:
            print '###############################'

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
    
  

    fnArray ='C:\\python25\\half.txt'
    import os
    seq_=os.listdir("ding")
    print seq_
    os.chdir("C:\\python25\\New Folder")
    for file_ in seq_:
        if os.path.isfile(file_):
            rem=open(file_)
            dingg=rem.readlines()
    fnSeq = dingg
    
    
  
    outputfile =  "sequence_calc_data.txt"
  
         
  
    arraySet = 1
  
    outList = []
  
    calcdata = 1
  
    while not calcdata is None:
  
        seqSet = 1
  
        while True:
  
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            print calcdata
  
            if calcdata:
  
                outList.append(calcdata)
  
                seqSet += 1
  
            else:
  
                break
  
        arraySet += 1

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

waiting for ur reply,
cheers!

**bvdet** · Dec 12 '07, 04:17 AM

What error message are you receiving? It appears that you are trying to read data in from the files in a directory. You should be passing the file name to your functions, not the text from one of the files. Try something like this:[code=Python]import os
# Create a list of directory entries
dir_name = 'your_directory '
fList = os.listdir(dir_ name)
# The directory entries do not include the full path
# Create a list of file names with full path, eliminating subdirectory entries
fList1 = [os.path.join(di r_name, f) for f in fList if os.path.isfile( os.path.join(di r_name, f))]
seqSetIndex = 0
fnSeq = fList1[seqSetIndex]
while True:
... do stuff ...
seqSetIndex += 1[/code]'fnSeq' is the file name will full path that is passed to your parsing function.

**aboxylica** · Dec 12 '07, 07:16 AM

to show you typically how my files are.. I am storing them inside python 25 folder..the folder is called New folder which has around five thousand files..so i cant possibly give the names of all the files. for eg YAL001C,YAL002w ..etc
each file has sequence like this:
>Scer_YAL001C SGDID:S0000001 5' untranslated region, Chr I 151168 - 152167 (revcom), 1000 bp
ACTTGTAAATATATC TTTTATTTTCCGAGA GGAAAAAGTTTCAAA AAAAAAAAAAAAAAA AGAAGAAAAATAACT TTCTCATGTAATAAA AGGTAACTAATGTAG ACAAAAAAGTATACA TTTAGCTTTTCTTTT TTTGATGATTTTTGA GTTTCATGTTACTAA TCAGAACAATTAACG ACACCTTCATTATGA AAAAATTAATTAGCT ATAAGTCTTCGAAGT AGAACATGATATTTG GCAATCACTCGAATA ACTATCTTAATTTAC CTGCTGAAATAATTT GAAAAAACACCCGAG GCAGCAGACGAAAGG TGTTTTTGCTAAACA ATGATTGATTTCTGG CGCCATTTCTACATT CTGAACAGTTCATCT CATTTCAGTAACAGT ACTTCAATGGAATAT TTATTAAAGAAAGTG CTTAAAAAAGTATTA TAAAACGATACATGG ACTGACTCAAGATTG AGCTAATAAGGTCCA CCGCCTAGTGCTTAA GAGTTCTGTACCACT ATAATAATTTATCTT GATCGTATTATGTGT AAAAAAAAGGCGCTT GAAATGAAAGCTCCG AAAATTAAAATACTT TGACTGCTTCGGAAA ACAAAAACATATAAA TAAATTTAAAAAATA AACTGTAAAATATTT AAAAACTATTAAAAA TATTTTATATTTTTA AAATTATTTATTATT ATGTCATGTGACAAG ACTTAAATCATTACA TAAAAGGTTTTGAAG TTCAATGTCAAAGTC AATATAATAAGCATA CTAAGGCACACTTAT GCAAATCGAGTTATT GAAGCTGGTAAAATT ATAAGATTTTTATTT TTATTTCTTTTATTT CTGCAAATCTGCATT TTCAAATACCGCTTG GTTTTTTGCATCATA AAGGGCGGCGCTTTC AGTCGCGAAAGTGAA ATAAACAACCAGTCA CACATATAACTTTCT TCTTGCCATAAGAGA GAAGAGGACGTTTGG TTGAAGCCAACTAGC CACAAGAAAA
>Spar YAL001C c218:24375..253 75
TCAAGAGGGTTATTA TATACCGATATTTGA ATCCACACATGATCG AACTAATATAATTCA CTACTTAGCTGCTTA ACCATTCTTTGCCAT TATAATAAATGTATC TCGATCGAATGCTGC ATAGAGAAAAGCGCT TAAAAAAGTGTTCCG AAATATAACATATTT TAAACGTTTCGGAAA CCAAAAACATATATT TATTATCTTAAAGTA ATCTAAAAAATGAAA GAAACTTTGATATAT TTAAAACATAATATA ATTATTTTATAAAAG TAACATGTGATTAAA CTCACAAAGCCTAAA AATGTTTTTAATCAT TATGTTAAGCTGAAT ATAGTATATCTAATA ATTGTTTATTTAAGC AGATTGAACTGTGAA TGCTAGTAAATTTTT AAGGGTTTTTTGTTG TTCTGCTTTTCTCTA GTTTTGTAGTGTCAA ATACAACTAACTGGA TTTTTTGCATCAGAA GGGGCGCTTTAACTC GCGAAAGTAAAAATA AACAATTAATCACAC ATATCTTTTCCTCTT GCAGAAGCAAAGAAG AGGACGGTTGATTGA AGCAAACCAGCCACA GAAAATATGGCGTTG ACAATTTATCCTGAT GAACTGGTTAAAATA GTGTCTGATGAAATT GCATCAAATAAAGGA AGTATGTATATGCCT CATTCTTCTATTCCA TGTTCTTTTCAGGTG AGAAACGTGATATAT TGTAAGATTATTTAC TAACGACTTATTAAA GAAATTACGTTAAAT CAGCTTTGGGATATA TCTCGTAAATATTTT GATTTGTCTGACGAG AAAGTTAAACAATTT GTGCTTTCATGCGTG ATGTTGAAAAAGGAC ATCGAGGTGTACTGT GATAGCGTTATAACA ACTAAAAACGTGACA AATATTATAGACGAC ACTAGTCATTCATAC TCAGTAGGGATTACT GAGGACAGCCTGTGG ACGTTATTAACTGGA TACACAAAGAAGGAG TCAACTATCGGGAAT TCAGCATTTGA

similarly each file has such sequences my code has to go to all the sequences and do it for all the files.
when i run this code , it says

Microsoft Windows XP [Version 5.1.2600]
(C) Copyright 1985-2001 Microsoft Corp.

C:\Python25>pyt hon this_final.py
['sequence_calc_ data.txt', 'YAL001C.fasta' , 'YAL002W.fasta' , 'YAL003W.fasta' , 'Y
AL005C.fasta', 'YAL007C.fasta']
am here
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
oh yes
############### ############### #
We have reached the end of the file!
None
why is this happening.
this is my code

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
  

    fnArray ='C:\python25\half.txt'
    fnSeq = 'C:\python25\New Folder'
    import os
    dir_name='New Folder'
    fList=os.listdir(dir_name)
    fList1=[os.path.join(dir_name,f) for f in fList if
    os.path.isfile(os.path.join(dir_name,f))]
    seqSetIndex=0
    fnSeq=fList1[seqSetIndex]
    while True:
        seqSetIndex+=1
    
    
  
    outputfile =  "sequence_calc_data.txt"
  
         
  
    arraySet = 1
  
    outList = []
  
    calcdata = 1
  
    while not calcdata is None:
  
        seqSet = 1
  
        while True:
  
            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            print calcdata
  
            if calcdata:
  
                outList.append(calcdata)
  
                seqSet += 1
  
            else:
  
                break
  
        arraySet += 1

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

**aboxylica** · Dec 12 '07, 08:00 AM

when i run the code below.. its going to an infinite loop which says "reached the end of file"

Code:

from math import *

def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fn, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
  

    fnArray ='C:\python25\half.txt'
    fnSeq = 'C:\python25\New Folder'
    import os
    dir_name='New Folder'
    fList=os.listdir(dir_name)
    fList1=[os.path.join(dir_name,f) for f in fList if
    os.path.isfile(os.path.join(dir_name,f))]
    seqSetIndex=0
    fnSeq=fList1[seqSetIndex]
    while True:
        
    
    
  
        outputfile =  "sequence_calc_data.txt"
  
         
  
        arraySet = 1
  
        outList = []
  
        calcdata = 1
  
        while not calcdata is None:
  
            seqSet = 1
  
            while True:
  
                calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                print calcdata
  
                if calcdata:
  
                    outList.append(calcdata)
  
                    seqSet += 1
  
                else:
  
                    break
  
        arraySet += 1
        seqSetIndex+=1

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

**bvdet** · Dec 12 '07, 02:58 PM

You have two while loops. When calcData() returns None (when the end of file is reached), you issue one break statement. You may need another to get out of the other loop.

**aboxylica** · Dec 12 '07, 06:28 PM

this is my code now.. i have added a break statement and still there seems to be some prob. it is coming out of the loop and saying
we have reached the end of file
None
here is the code

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fnSeq, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        #if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
  

    fnArray ='C:\python25\half.txt'
    fnSeq = 'C:\python25\New Folder'
    import os
    dir_name='New Folder'
    fList=os.listdir(dir_name)
    fList1=[os.path.join(dir_name,f) for f in fList if os.path.isfile(os.path.join(dir_name,f))]
    seqSetIndex=0
    fnSeq=fList1[seqSetIndex]
    while True:
        
    
    
  
        outputfile =  "sequence_calc_data.txt"
  
         
  
        arraySet = 1
  
        outList = []
  
        calcdata = 1
  
        while not calcdata is None:
  
            seqSet = 1
  
            while True:
  
                calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                print calcdata
  
                if calcdata:
  
                    outList.append(calcdata)
  
                    seqSet += 1
  
                else:
  
                    break
  
            arraySet += 1
            seqSetIndex+=1
        else:
            break
    

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

looping through a big file containing a set of files.

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment