looping through a big file containing a set of files.

**aboxylica** · Dec 13 '07, 05:53 AM

hey!
That code seens to work but the problem is it is checking only for the first six files... i donno where is the looping problem!!
can you see it??

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fnSeq, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fnSeq)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        #if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
  

    fnArray ='one_redfly.transfac'
    fnSeq = 'deepthi/upstream_regions'
    import os
    dir_name='upstream_regions'
    fList=os.listdir(dir_name)
    fList1=[os.path.join(dir_name,f) for f in fList if os.path.isfile(os.path.join(dir_name,f))]
    seqSetIndex=0
    fnSeq=fList1[seqSetIndex]

    while True:
        
    
    
  
        outputfile =  "sequence_calc_data.txt"
  
         
  
        arraySet = 1
  
        outList = []
  
        calcdata = 1
  
        while not calcdata is None:
  
            seqSet = 1
  
            while True:
  
                calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                print calcdata
  
                if calcdata:
  
                    outList.append(calcdata)
  
                    seqSet += 1
  
                else:
  
                    break
  
            arraySet += 1
            seqSetIndex+=1
        else:
            break
    

        
  
       
  
    f = open(outputfile, 'w')
  
    f.write('\n'.join(outList))
  
    f.close()
    #f=open(outputfile,"r")
    #file_con=f.readlines()
    #for line in file_con:
     #   print line

waiting for ur reply!
cheers!!

**aboxylica** · Dec 13 '07, 06:41 AM

when the seqset is becoming 6.. it is coming out of the loop..why is this happening???

**bvdet** · Dec 13 '07, 03:33 PM

This code is untested, but I think is close to what you are trying to do:[code=Python]if __name__ == '__main__':
import os

fnArray = 'array_file'
outputfile = 'output_file'
masterList = []

# user has multiple sequence files
# and must iterate over all the files
dir_name = 'your_directory '
fileList = [os.path.join(di r_name,f) for f in os.listdir(dir_ name)\
if os.path.isfile( os.path.join(di r_name,f))]

for fnSeq in fileList:
outList = []
calcdata = 1
while not calcdata is None:
seqSet = 1
while True:
calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
if calcdata:
outList.append( calcdata)
seqSet += 1
else:
break
arraySet += 1
masterList.appe nd('\n'.join(ou tList))
f = open(outputfile , 'w')
f.write('\n'.jo in(masterList))
f.close()
[/code]

**aboxylica** · Dec 13 '07, 07:45 PM

i donno the code is not executing.. sequence set is not at all incrementing..
when i print seqSet.. it is always one..it is not incrementing and the resultlist is empty.
i have been trying.. there is some problem in writing it to a file also..

Code:

from math import *
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''

    f = open(fn)

 

    # skip to required data set

    for _ in range(dataset):
    

        try:

            line = f.next()

            while not line.startswith(key):

                line = f.next()

        except StopIteration, e:

            #print 'We have reached the end of the file!'

            f.close()

            return False

 

    headerList = line.strip().split()[1:]
    

    lineList = []

 

    line = f.next().strip()

    while not line.startswith(term):

        if line != '':

            lineList.append(line.strip().split())


        line = f.next().strip()

 

    f.close()

 

    # Key list

    keys = [i[0] for i in lineList]

    # Values list

    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]

 

    # Create a dictionary from keys and values

    lineDict = dict(zip(keys, values))

 

    dataDict = {}

 

    for i, item in enumerate(headerList):

        dataDict[item] = {}

        for key in lineDict:

            dataDict[item][key] = lineDict[key][i]

 

    # Add 1.0 to every element in dataDict subdictionaries

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:

            dataDict[keyMain][keySub] += 1.0

 

    # Normalize original data (with 1 added) and update data

    valueSums = [sum(item)+4 for item in values]

    # print valueSums

 

    for keyMain in dataDict:

        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]

    return dataDict

 

 

def parseData(fnSeq, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''   

    # initialize output list

    dataList = []

   

    # open file for reading

    f = open(fn)

   

    # skip to required data set

    for _ in range(dataset):


        try:

            s = f.next()

            while not s.startswith(key):
            

                s = f.next()

        except StopIteration, e:

            #print 'We have reached the end of the file!'

            f.close()

            return False

 

    # initialize output list

    dataList = [s,]

       
    for line in f:

        if not line.startswith(key):

            dataList.append(line.strip())

        else:

            break

 

    f.close()

    return dataList


 

   
def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
   
    # sequence factor dictionary
   
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
   
         
   
    dataArray = parseArray(fnArray, arraySet)
    print dataArray

   
    if dataArray:

        dataSeq = parseData(fnSeq, seqSet)
        #print dataSeq,"@@@@@@@@@@@@@@@@@@@"

   
        if not dataSeq:
   
            return False
  
    else:
  
        return None
    
  
         
  
    # This is the complete sequence 
  
    seq = ''.join(dataSeq[1:])

    
    


    # These are the subkeys of dataArray - '01', '02', '03',.............
  
    subKeys = dataArray['A'].keys()

    subKeys.sort()

    
  
       
  
    # Calculate num/den for each slice of sequence
  
    # Each sequence slice length = length of subKeys
  
    # Example:
    # seq = 'ATCGATA'
  
    # subKeys length = 3
  
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'

    numList = []
  
    denList = []
  
    seqList = []
  
    for i in xrange(len(seq) - len(subKeys)):
  
        subseq = seq[0:len(subKeys)]
  
        seqList.append(subseq)

  
        num, den = 1, 1
  
        for j, s in enumerate(subseq):
  
            num *= dataArray[s][subKeys[j]]
  
            den *= value[s]
  
        numList.append(num)
  
        denList.append(den)
  
        seq = seq[1:]
  
       
    
    resultList = []
  
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
	    resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
	
        
   
       
  
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    
  
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
    import os
    
  

    fnArray ='half.txt'
    outputfile='output_file'
    masterList=[]
    dir_name='New Folder'
    fileList=[os.path.join(dir_name,f) for f in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name,f))]
    
    
    
    for fnSeq in fileList:
        outList=[]
        calcdata=1
        arraySet=1
        
        while not calcdata is None:
            
            seqSet=1
            while True:
                print "am here"
                
                calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
                
                if calcdata:
                    outList.append(calcdata)
                    #print outList
                    seqSet += 1
                    print seqSet,""################"
                else:
                    break
            arraySet+=1
        masterList.append('\n'.join(outList))
    #f=open(outputfile,'W')
    #fwrite('\n'.join(masterList))
    #f.close()

waiting for ur reply
cheers!

**bvdet** · Dec 13 '07, 08:34 PM

There is no need to post the entire code repeatedly. Presumably the code before "if __name__ == '__main__':" works. I assume that you have one array file and multiple sequence files. You want to iterate on each array set in the array file on each sequence set for each sequence file in a separate directory. The following code works for me given the previous assumptions:[code=Python]if __name__ == '__main__':
import os

fnArray = r'H:\array.txt'
outputfile = r'H:\seq_array_ output.txt'
masterList = []

# user has multiple sequence files
# and must iterate over all the files
dir_name = r'H:\TEMP\seque nce_files
fileList = [os.path.join(di r_name,f) for f in os.listdir(dir_ name)\
if os.path.isfile( os.path.join(di r_name,f))]

for fnSeq in fileList:
outList = []
calcdata = 1
arraySet = 1
while not calcdata is None:
seqSet = 1
while True:
calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
if calcdata:
outList.append( calcdata)
seqSet += 1
else:
break
arraySet += 1
masterList.appe nd('\n'.join(ou tList))
f = open(outputfile , 'w')
f.write('\n'.jo in(masterList))
f.close() [/code]Following is a snippet from the array file:

NA bin
PO A C G T
01 0.45 8.27 0.00 11.39
02 0.00 0.00 10.02 10.09
03 5.80 1.39 0.00 12.93
//
//
NA bap
PO A C G T
01 0.00 3.67 0.00 0.00
02 0.00 0.00 3.67 0.00
03 0.00 0.00 0.00 3.67
04 0.00 3.67 0.00 0.00
//
//
NA bcd
PO A C G T
01 42.55 8.75 145.86 8.14
02 0.14 0.53 204.64 0.00
03 126.83 78.02 0.11 0.34
04 0.21 0.17 0.00 204.92
05 0.00 12.38 0.43 192.50
06 174.48 0.95 1.32 28.56
07 79.53 4.70 100.44 20.64
//
//

Following is a snippet from one of the sequence files:

>CG9571_O-E|Drosophila melanogaster|CG 9571|FBgn003108 6|X:19926374..1 9927133
CCAGTCCACCGGCCG CCGATCTATTTATAC GAGAGGAAGAGGCTG AACTCGAGGATTACC CGTGTATCCTGGGAC GCG
GATTAGCGATCCATT CCCCTTTTAATCGCC GCGCAAACAGATTCA TGAAAGCCTTCGGAT TCATTCATTGATCCA CAT
CGTTTTTCTGTTATT AATAAAAAACCAATA GGAAAGTTCTCAAAA ATTACTCTGTTGTAT TTGATCATTTCTTTT CCG
GTATAATCTTTTATT TTAAGCATTCCCATG TGAATAAATTTCAGA CTAATGTATTAATAA GATGTCGTGTTTTTC CAC
TTACAAATTTCTCAT ACAGCTGGATATATA CTACGAGTACTATAC ACATGCTCTGGG
>Cp36_DRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8323349..8324 136
AGTCGACCAGCACGA GATCTCACCTACCTT CTTTATAAGCGGGGT CTCTAGAAGCTAAAT CCATGTCCACGTCAA ACC
AAAGACTTGCGGTCT CCAGACCATTGAGTT CTATAAATGGGACTG AGCCACACCATACAC CACACACCACACATA CAC
AACATTG
>Cp36_PRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8324430..8324 513
TCTAGAGATCTGGGC ACGATGGCGAGACAA AGATGCGGCGCAAAA TCGGAAATGGAGATG GATCACGTAGCCGGC CAT
GGCGG
>Him_distal|Dro sophila melanogaster|Hi m|FBgn0030900|X :18039896..1804 3470
GGTTTTCTGCGATGG CTTCCGCGCCAGCTG AAGTATCTGATTTGC TGCCTTGTTTTTGTT GATATTTCTGCGAAG GGA
CTTGTGCTTTTCAAA TGGCCTTTTTTTGGG ATTACGGCAAGGGCG CGTTTCCCACGCTCG ATCCCCACTTACCAT TGG
TGCACGCGATTGCGG CAAGCTGCTGAGGCA AGCTATTAAACGCCA CACTGGGCCGGGGGG CGGTACCGGTGGGCG TGG
AATATATAGATCGGA GATATCGCAGGACCC ACAGCAGAGCAGAGC CGCAGAGCCACCAAC CTCG

I don't know what else I can do! Maybe you should check if your defined functions are working as expected.

**aboxylica** · Dec 14 '07, 07:52 AM

I sort of took away the spaces and tried executing.. the other functions are correct. please execute this code with the inputs i paste now and please paste ur o/p file
my directory containing the sequence files is called "up"

this contains 24 files.. but let me paste the first two files.
first file called YAL012w contains the sequence
>Scer_YAL012W SGDID:S0000010 5' untranslated region, Chr I 130534 - 130801, 268 bp
ATATGTTGTAACGCT ATGTAATTCCACCCT TCATTACTAATAATT AGCCATTCACGTGAT CTCAGCCAGTTGTGG CGCCACACTTTTTTT TCCATAAAAATCCTC GAGGAAAAGAAAAGA AAAAAATATTTCAGT TATTTAAAGCATAAG ATGCCAGGTAGATGG AACTTGTGCCGTGCC AGATTGAATTTTGAA AGTACAATTGAGGCC TATACACATAGACAT TTGCACCTTATACAT ATACACACAAGACAA AACCAAAAAAAAT
>Smik YAL012W c875:991..1258
GCCACACCGTCTCTA CACAGCTTCTTGTTC TTGTTATTCCTAAAT ATGTTGTAACGCTAC GTAATTCCACCTTTC ATTACTAATATTAAG CCATTCACGTGACTT TTTCCCAGCGCCTGG TGTGGCGCCACACTT TTTTTTCATAAGAAT CCTCGAGGAAAACAG AAAAATATTTCAGTT ATTTAAACTATAAGA TGCTTGGGGATATAA TGATGGTGCCGTGTG TTTGTGTTTGAGTGT GTTTGAGTGTTCTGA AAGAGAGAGC
>Spar YAL012W c218:45412..456 79 (rev)
TTCTTAAATATGTTG TAACGCTATGTAATT CCACCCTTCATTACT AATAATTAGCCATTC ACGTGATACCAGCCA GCTGTGGCGCCACAC TTTTTTTCCATAAAA ATCCTCGAGGAAAAG AAAAGAAAAAATATT TCAGTTATTTAAACC AAAAGATGCCTGGGA GATAGAGCTGGTGCC GTGTGAAATCCAGTT TTGAAAGTGCAATTG AGTCCTACACACATA AGCATCCGCACCTTG TATACATATACGCCA AACAAAACAAAAA
and the second file is called YAl013 and contains the following sequence
>Scer_YAL013W SGDID:S0000011 5' untranslated region, Chr I 129021 - 129271, 251 bp
TACCGTGTGTGCTTA CGCATCTATTTGCTG TCGTGAGATCTGTCT CTATGCTTTATTCGT TTTCCATTGTAAAGT CTCAGCATTTATTTC TTGTTCTTTACTTGT TTTTGCCCTTTCGGG TGATCAAAGTCGTGC TGGGAAATTTTATTC TTATAAAATGATTTT TAGAAATAATAAACT CATAACAGTGCAACG GCAAAGTACAAGGGA AGGAAGCACAGAAGC AAGAGGAGGCGCATC GATCGTGGCAG
>Sbay YAL013W c669:46323..465 74 (rev)
ACGTCCATCCCGTGT GTGCTTGCACATCTG CTTGGCCTGCGAGCC GTCCTCTGGGCCATC CGCTCCCTATTGTGA AGTACCAGCCTTTGT TCCTGCTCTTCATAG TTTGCTCTTTCGGGT GATGAAAGTTGGGCT GGAAATTTTATCCCA AAGGACAATATGAAA ATAAACTCGAAACTG TGTAGCAACCAAGAG GGAAGGGGAAAGGGA GAGACAGGAGCAAGA GGAAGCATCGGTCGT GGCAGATGAGTC
>Spar YAL013W c218:46936..471 86 (rev)
ATACCGTGTGTGCTT ACGCATCTATTTGTT CTCGTGAGCCCCGTC TCTGGACTTTATCCG TTCGCTATTGTAAAG TCTTAGCCTTTATCT CTAGTTCTTTACCTG CTTTTGCTCTTTCGG GTGATGAAAGTTGCG CTGGGAAATTTTATT CCCATAGAATGATTT TTAGAAATAATAAAC TCATAACAGTGCAAC GACCAAGTGCAGGGG AAAGTAGCATAGAAG CAAGAGGAGACGCAT CGATCGCGGCA

the matrix file am using is called weight matrix.transfac .txt
//
NA Abd-B
PO A C G T
01 10.19 0.00 10.65 6.24
02 5.79 0.67 10.50 10.11
03 4.50 0.00 0.00 22.57
04 0.00 0.00 0.00 27.08
05 0.00 0.00 0.00 27.08
06 0.00 0.00 0.00 27.08
07 27.08 0.00 0.00 0.00
08 0.00 2.83 0.00 24.25
09 0.00 0.00 24.45 2.62
10 19.33 0.00 4.34 3.41
11 0.31 12.28 3.39 11.09
//
//
NA Adf1
PO A C G T
01 0.71 0.08 26.02 1.55
02 3.03 23.00 1.24 1.09
03 0.26 10.50 3.29 14.31
04 0.00 0.06 28.23 0.07
05 0.12 27.27 0.06 0.91
06 1.44 20.36 0.37 6.19
07 5.35 0.28 21.49 1.24
08 7.81 16.10 3.81 0.63
09 0.51 17.77 0.45 9.63
10 0.00 0.14 28.21 0.00
11 0.00 25.69 0.20 2.46
12 0.48 9.98 0.07 17.82
13 1.27 0.00 27.01 0.07
14 15.59 7.98 2.92 1.87
15 4.28 22.37 0.00 1.70
16 0.18 0.77 22.70 4.70
//
//
NA Aef1
PO A C G T
01 0.00 0.06 12.49 0.00
02 3.80 0.17 0.00 8.57
03 0.87 0.06 0.00 11.62
04 0.06 9.76 2.32 0.41
05 9.82 0.00 2.73 0.00
06 9.76 0.00 0.00 2.78
07 3.80 0.31 0.00 8.43
08 0.00 0.00 0.00 12.54
09 0.00 6.53 5.85 0.17
10 0.00 12.38 0.17 0.00
11 2.73 1.02 8.80 0.00
12 5.85 0.00 6.70 0.00
13 1.02 5.96 0.00 5.57
14 0.00 5.16 4.66 2.73
15 1.03 7.55 3.97 0.00
16 4.82 5.00 2.73 0.00
//
//

**aboxylica** · Dec 14 '07, 07:53 AM

and here is the code with spaces removed!

Code:

from math import *
print "hellllllllo"
def parseArray(fn, dataset=1, key='PO', term='/'):

    '''

    Read a formatted data file in matrix format and

    compile data into a dictionary

    '''
    f = open(fn)
    # skip to required data set
    for _ in range(dataset):
        try:
            line = f.next()
            while not line.startswith(key):
                line = f.next()
        except StopIteration, e:
            print 'We have reached the end of the file!','matrix file'
            f.close()
            return False
    headerList = line.strip().split()[1:]
    lineList = []
    line = f.next().strip()
    while not line.startswith(term):
        if line != '':
            lineList.append(line.strip().split())
    line = f.next().strip()
    f.close()
    # Key list
    keys = [i[0] for i in lineList]
    # Values list
    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
    # Create a dictionary from keys and values
    lineDict = dict(zip(keys, values))
    dataDict = {}
    for i, item in enumerate(headerList):
        dataDict[item] = {}
        for key in lineDict:
            dataDict[item][key] = lineDict[key][i]
    # Add 1.0 to every element in dataDict subdictionaries
    for keyMain in dataDict:
        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] += 1.0
    # Normalize original data (with 1 added) and update data
    valueSums = [sum(item)+4 for item in values]
    # print valueSums
    for keyMain in dataDict:
        for keySub in dataDict[keyMain]:
            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
    return dataDict

def parseData(fnSeq, dataset=1, key='>'):

    '''

    Read a formatted data file of sequences

    Return a list of sequences

    The first element in the list is the header

    '''
    # initialize output list
    dataList = []
    # open file for reading
    f = open(fn)
    # skip to required data set
    for _ in range(dataset):
        try:
            s = f.next()
            while not s.startswith(key):
                s = f.next()
        except StopIteration, e:
            print 'We have reached the end of the file!','sequence file'
            f.close()
            return False
    # initialize output list
    dataList = [s,]
    for line in f:
        if not line.startswith(key):
            dataList.append(line.strip())
        else:
            break
    f.close()
    return dataList

def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
    
   # sequence factor dictionary
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
    dataArray = parseArray(fnArray, arraySet)
   #print dataArray
    if dataArray:
        dataSeq = parseData(fnSeq, seqSet)
        #print dataSeq,"@@@@@@@@@@@@@@@@@@@"
        if not dataSeq:
            return False
    else:
        return None
    # This is the complete sequence
    seq = ''.join(dataSeq[1:])
    # These are the subkeys of dataArray - '01', '02', '03',.............
    subKeys = dataArray['A'].keys()
    subKeys.sort()
    # Calculate num/den for each slice of sequence
    # Each sequence slice length = length of subKeys
    # Example:
    # seq = 'ATCGATA'
    # subKeys length = 3
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
    numList = []
    denList = []
    seqList = []
    for i in xrange(len(seq) - len(subKeys)):
        subseq = seq[0:len(subKeys)]
        seqList.append(subseq)
        num, den = 1, 1
        for j, s in enumerate(subseq):
            num *= dataArray[s][subKeys[j]]
            den *= value[s]
        numList.append(num)
        denList.append(den)
        seq = seq[1:]
    resultList = []
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
            resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
    
  
if __name__ == '__main__':
    import os
    fnArray =r'c:\python25\weightmatrix.transfac.txt'
    outputfile=r'c:\python25\output_file.txt'
    masterList=[]
    dir_name=r'c:\python25\up'
    fileList=[os.path.join(dir_name,f) for f in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name,f))]
    for fnSeq in fileList:
        outList=[]
        calcdata=1
        arraySet=1
        while not calcdata is None:
            seqSet=1
            while True:
                calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
                if calcdata:
                    outList.append(calcdata)
                    print calcdata
                    seqSet += 1
                    #print seqSet,""################"
                else:
                    break
            arraySet+=1
        masterList.append('\n'.join(outList))
    #f=open(outputfile,'W')
    #fwrite('\n'.join(masterList))
    #f.close()

please paste ur output file!
i have been stuck for a longtime now. the file should give you values 1 with sequence parts corresponding to it.
waiting for your reply,
cheers!

**aboxylica** · Dec 16 '07, 07:06 PM

oh........ i got the problem! Thanks

**aboxylica** · Dec 17 '07, 08:59 AM

i am encountering a problem when i run my code for a huge folder containing many sequence files.. this is what it says
Traceback (most recent call last):
File "this_final_11. py", line 163, in <module>
calcdata=compil eData(fnArray,f nSeq,arraySet,s eqSet)
File "this_final_11. py", line 121, in compileData
num *= dataArray[s][subKeys[j]]
KeyError: 'N'

Code:

def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
        
    # sequence factor dictionary
    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
    dataArray = parseArray(fnArray, arraySet)
   #print dataArray
    if dataArray:
        dataSeq = parseData(fnSeq, seqSet)
        #print dataSeq,"@@@@@@@@@@@@@@@@@@@"
        if not dataSeq:
            return False
    else:
        return None
    # This is the complete sequence
    seq = ''.join(dataSeq[1:])
    # These are the subkeys of dataArray - '01', '02', '03',.............
    subKeys = dataArray['A'].keys()
    subKeys.sort()
    # Calculate num/den for each slice of sequence
    # Each sequence slice length = length of subKeys
    # Example:
    # seq = 'ATCGATA'
    # subKeys length = 3
    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
    numList = []
    denList = []
    seqList = []
    for i in xrange(len(seq) - len(subKeys)):
        subseq = seq[0:len(subKeys)]
        seqList.append(subseq)
        num, den = 1, 1
        for j, s in enumerate(subseq):
            num *= dataArray[s][subKeys[j]]
            print num,"######",dataArray[s][subKeys[j]],"#########"
            den *= value[s]
        numList.append(num)
        denList.append(den)
        seq = seq[1:]
    resultList = []
    for i, num in enumerate(numList):
        #p=log10(num/denList[i])
        #if (p) >=2:
            #print "#########",abs(int(p))
        if (log10(num/denList[i]))>=2:
            #print "i am here"
            resultList.append(int(abs(1)))
    #print resultList
    #for i in resultList:
	#mean=sum(resultList)/len(resultList)
        #sub=mean-i
        #queue = []
        #queue = (sub)**2
        #print sqrt(queue/len(resultList))
	
    #print mean,"@@@@@@@@@@"
    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
    #print "this is line 294"
    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)

can u see the problem.. i don get it

**bvdet** · Dec 17 '07, 02:26 PM

It appears that you have an 'N' in your sequence data. There is no such key in dataArray. Check your data files.

**aboxylica** · Dec 18 '07, 07:17 AM

okay.. i have to say that if it encounters "N" the value of this alphabet should be taken as "0"... I am able to do this for denominator.. but for the numerator being calculated.. i want to specify that the log value of "N" is zero .. and if many "NNNNN" come together.. this part of "NNNN" can be omitted.. how do i do this??

**aboxylica** · Dec 18 '07, 07:47 AM

this is what am doing and it doesnt seem to solve the problem.

Code:

numList = []



    denList = []
    seqList = []
    for i in xrange(len(seq) - len(subKeys)):
        subseq = seq[0:len(subKeys)]
        seqList.append(subseq)
        num, den = 1, 1
        for j, s in enumerate(subseq):
            if s=="N":
                value[s]=0
                
                
                num *= dataArray[s][subKeys[j]]
                den *= value[s]
        numList.append(num)
        denList.append(den)

seq = seq[1:]

**bvdet** · Dec 18 '07, 02:41 PM

Originally posted by aboxylica

this is what am doing and it doesnt seem to solve the problem.

Code:

numList = []



    denList = []
    seqList = []
    for i in xrange(len(seq) - len(subKeys)):
        subseq = seq[0:len(subKeys)]
        seqList.append(subseq)
        num, den = 1, 1
        for j, s in enumerate(subseq):
            if s=="N":
                value[s]=0
                
                
                num *= dataArray[s][subKeys[j]]
                den *= value[s]
        numList.append(num)
        denList.append(den)

seq = seq[1:]

You can test for a valid key and perform some alternative action if an invalid key is encountered:[code=Python] for j, s in enumerate(subse q):
if s in 'ACGT':
num *= dataArray[s][subKeys[j]]
den *= value[s]
else:
..............
numList.append( num)
denList.append( den)[/code]Be careful with setting the variable den to zero, because the value will be used as a divisor later and will cause a ZeroDivisionErr or.

**aboxylica** · Dec 19 '07, 08:30 AM

my code is running successffully.. my code ran for five hours.results were ok

but when its writing to a file it said.

We have reached the end of the file! sequence file

We have reached the end of the file! matrix file

Traceback (most recent call last):

File "C:\Python25\th is_final_1.py", line 174, in <module>

f.write('\n'.jo in(masterList))

MemoryError

i know its a very huge file but is there an efficient way to solve this problem??

**aboxylica** · Dec 19 '07, 10:26 AM

And i also think that this program should not take such a longtime to run..though my sequence folder is 18MB and array file is 69KB. though it is pretty huge the calculation is simple only.. can i compute how long it will typically take to calculate??
I think there is some serious error
waiting for ur reply,
cheers!!

looping through a big file containing a set of files.

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment

Comment