looping through a big file containing a set of files.

Collapse
X
 
  • Time
  • Show
Clear All
new posts
  • aboxylica
    New Member
    • Jul 2007
    • 111

    #76
    hey!
    That code seens to work but the problem is it is checking only for the first six files... i donno where is the looping problem!!
    can you see it??
    Code:
    from math import *
    def parseArray(fn, dataset=1, key='PO', term='/'):
    
        '''
    
        Read a formatted data file in matrix format and
    
        compile data into a dictionary
    
        '''
    
        f = open(fn)
    
     
    
        # skip to required data set
    
        for _ in range(dataset):
        
    
            try:
    
                line = f.next()
    
                while not line.startswith(key):
    
                    line = f.next()
    
            except StopIteration, e:
    
                print 'We have reached the end of the file!'
    
                f.close()
    
                return False
    
     
    
        headerList = line.strip().split()[1:]
        
    
        lineList = []
    
     
    
        line = f.next().strip()
    
        while not line.startswith(term):
    
            if line != '':
    
                lineList.append(line.strip().split())
    
    
            line = f.next().strip()
    
     
    
        f.close()
    
     
    
        # Key list
    
        keys = [i[0] for i in lineList]
    
        # Values list
    
        values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
    
     
    
        # Create a dictionary from keys and values
    
        lineDict = dict(zip(keys, values))
    
     
    
        dataDict = {}
    
     
    
        for i, item in enumerate(headerList):
    
            dataDict[item] = {}
    
            for key in lineDict:
    
                dataDict[item][key] = lineDict[key][i]
    
     
    
        # Add 1.0 to every element in dataDict subdictionaries
    
        for keyMain in dataDict:
    
            for keySub in dataDict[keyMain]:
    
                dataDict[keyMain][keySub] += 1.0
    
     
    
        # Normalize original data (with 1 added) and update data
    
        valueSums = [sum(item)+4 for item in values]
    
        # print valueSums
    
     
    
        for keyMain in dataDict:
    
            for keySub in dataDict[keyMain]:
                dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
    
        return dataDict
    
     
    
     
    
    def parseData(fnSeq, dataset=1, key='>'):
    
        '''
    
        Read a formatted data file of sequences
    
        Return a list of sequences
    
        The first element in the list is the header
    
        '''   
    
        # initialize output list
    
        dataList = []
    
       
    
        # open file for reading
    
        f = open(fnSeq)
    
       
    
        # skip to required data set
    
        for _ in range(dataset):
    
    
            try:
    
                s = f.next()
    
                while not s.startswith(key):
                
    
                    s = f.next()
    
            except StopIteration, e:
    
                print 'We have reached the end of the file!'
    
                f.close()
    
                return False
    
     
    
        # initialize output list
    
        dataList = [s,]
    
           
        for line in f:
    
            if not line.startswith(key):
    
                dataList.append(line.strip())
    
            else:
    
                break
    
     
    
        f.close()
    
        return dataList
    
    
     
    
       
    def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
       
        # sequence factor dictionary
       
        value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
       
             
       
        dataArray = parseArray(fnArray, arraySet)
    
       
        if dataArray:
    
            dataSeq = parseData(fnSeq, seqSet)
    
       
            if not dataSeq:
       
                return False
      
        else:
      
            return None
        
      
             
      
        # This is the complete sequence 
      
        seq = ''.join(dataSeq[1:])
    
        
        
    
    
        # These are the subkeys of dataArray - '01', '02', '03',.............
      
        subKeys = dataArray['A'].keys()
    
        subKeys.sort()
    
        
      
           
      
        # Calculate num/den for each slice of sequence
      
        # Each sequence slice length = length of subKeys
      
        # Example:
        # seq = 'ATCGATA'
      
        # subKeys length = 3
      
        # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
    
        numList = []
      
        denList = []
      
        seqList = []
      
        for i in xrange(len(seq) - len(subKeys)):
      
            subseq = seq[0:len(subKeys)]
      
            seqList.append(subseq)
    
      
            num, den = 1, 1
      
            for j, s in enumerate(subseq):
      
                num *= dataArray[s][subKeys[j]]
      
                den *= value[s]
      
            numList.append(num)
      
            denList.append(den)
      
            seq = seq[1:]
      
           
        
        resultList = []
      
        for i, num in enumerate(numList):
            #p=log10(num/denList[i])
            #if (p) >=2:
                #print "#########",abs(int(p))
            #if (log10(num/denList[i]))>=2:
                #print "i am here"
    	    resultList.append(int(abs(1)))
        #print resultList
        #for i in resultList:
    	#mean=sum(resultList)/len(resultList)
            #sub=mean-i
            #queue = []
            #queue = (sub)**2
            #print sqrt(queue/len(resultList))
    	
        #print mean,"@@@@@@@@@@"
    	
            
       
           
      
        outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
        #print "this is line 294"
        
      
        return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
        
      
    if __name__ == '__main__':
      
    
        fnArray ='one_redfly.transfac'
        fnSeq = 'deepthi/upstream_regions'
        import os
        dir_name='upstream_regions'
        fList=os.listdir(dir_name)
        fList1=[os.path.join(dir_name,f) for f in fList if os.path.isfile(os.path.join(dir_name,f))]
        seqSetIndex=0
        fnSeq=fList1[seqSetIndex]
    
        while True:
            
        
        
      
            outputfile =  "sequence_calc_data.txt"
      
             
      
            arraySet = 1
      
            outList = []
      
            calcdata = 1
      
            while not calcdata is None:
      
                seqSet = 1
      
                while True:
      
                    calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                    print calcdata
      
                    if calcdata:
      
                        outList.append(calcdata)
      
                        seqSet += 1
      
                    else:
      
                        break
      
                arraySet += 1
                seqSetIndex+=1
            else:
                break
        
    
            
      
           
      
        f = open(outputfile, 'w')
      
        f.write('\n'.join(outList))
      
        f.close()
        #f=open(outputfile,"r")
        #file_con=f.readlines()
        #for line in file_con:
         #   print line
    waiting for ur reply!
    cheers!!

    Comment

    • aboxylica
      New Member
      • Jul 2007
      • 111

      #77
      when the seqset is becoming 6.. it is coming out of the loop..why is this happening???

      Comment

      • bvdet
        Recognized Expert Specialist
        • Oct 2006
        • 2851

        #78
        This code is untested, but I think is close to what you are trying to do:[code=Python]if __name__ == '__main__':
        import os

        fnArray = 'array_file'
        outputfile = 'output_file'
        masterList = []

        # user has multiple sequence files
        # and must iterate over all the files
        dir_name = 'your_directory '
        fileList = [os.path.join(di r_name,f) for f in os.listdir(dir_ name)\
        if os.path.isfile( os.path.join(di r_name,f))]

        for fnSeq in fileList:
        outList = []
        calcdata = 1
        while not calcdata is None:
        seqSet = 1
        while True:
        calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
        if calcdata:
        outList.append( calcdata)
        seqSet += 1
        else:
        break
        arraySet += 1
        masterList.appe nd('\n'.join(ou tList))
        f = open(outputfile , 'w')
        f.write('\n'.jo in(masterList))
        f.close()
        [/code]

        Comment

        • aboxylica
          New Member
          • Jul 2007
          • 111

          #79
          i donno the code is not executing.. sequence set is not at all incrementing..
          when i print seqSet.. it is always one..it is not incrementing and the resultlist is empty.
          i have been trying.. there is some problem in writing it to a file also..
          Code:
          from math import *
          def parseArray(fn, dataset=1, key='PO', term='/'):
          
              '''
          
              Read a formatted data file in matrix format and
          
              compile data into a dictionary
          
              '''
          
              f = open(fn)
          
           
          
              # skip to required data set
          
              for _ in range(dataset):
              
          
                  try:
          
                      line = f.next()
          
                      while not line.startswith(key):
          
                          line = f.next()
          
                  except StopIteration, e:
          
                      #print 'We have reached the end of the file!'
          
                      f.close()
          
                      return False
          
           
          
              headerList = line.strip().split()[1:]
              
          
              lineList = []
          
           
          
              line = f.next().strip()
          
              while not line.startswith(term):
          
                  if line != '':
          
                      lineList.append(line.strip().split())
          
          
                  line = f.next().strip()
          
           
          
              f.close()
          
           
          
              # Key list
          
              keys = [i[0] for i in lineList]
          
              # Values list
          
              values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
          
           
          
              # Create a dictionary from keys and values
          
              lineDict = dict(zip(keys, values))
          
           
          
              dataDict = {}
          
           
          
              for i, item in enumerate(headerList):
          
                  dataDict[item] = {}
          
                  for key in lineDict:
          
                      dataDict[item][key] = lineDict[key][i]
          
           
          
              # Add 1.0 to every element in dataDict subdictionaries
          
              for keyMain in dataDict:
          
                  for keySub in dataDict[keyMain]:
          
                      dataDict[keyMain][keySub] += 1.0
          
           
          
              # Normalize original data (with 1 added) and update data
          
              valueSums = [sum(item)+4 for item in values]
          
              # print valueSums
          
           
          
              for keyMain in dataDict:
          
                  for keySub in dataDict[keyMain]:
                      dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
          
              return dataDict
          
           
          
           
          
          def parseData(fnSeq, dataset=1, key='>'):
          
              '''
          
              Read a formatted data file of sequences
          
              Return a list of sequences
          
              The first element in the list is the header
          
              '''   
          
              # initialize output list
          
              dataList = []
          
             
          
              # open file for reading
          
              f = open(fn)
          
             
          
              # skip to required data set
          
              for _ in range(dataset):
          
          
                  try:
          
                      s = f.next()
          
                      while not s.startswith(key):
                      
          
                          s = f.next()
          
                  except StopIteration, e:
          
                      #print 'We have reached the end of the file!'
          
                      f.close()
          
                      return False
          
           
          
              # initialize output list
          
              dataList = [s,]
          
                 
              for line in f:
          
                  if not line.startswith(key):
          
                      dataList.append(line.strip())
          
                  else:
          
                      break
          
           
          
              f.close()
          
              return dataList
          
          
           
          
             
          def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
             
              # sequence factor dictionary
             
              value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
             
                   
             
              dataArray = parseArray(fnArray, arraySet)
              print dataArray
          
             
              if dataArray:
          
                  dataSeq = parseData(fnSeq, seqSet)
                  #print dataSeq,"@@@@@@@@@@@@@@@@@@@"
          
             
                  if not dataSeq:
             
                      return False
            
              else:
            
                  return None
              
            
                   
            
              # This is the complete sequence 
            
              seq = ''.join(dataSeq[1:])
          
              
              
          
          
              # These are the subkeys of dataArray - '01', '02', '03',.............
            
              subKeys = dataArray['A'].keys()
          
              subKeys.sort()
          
              
            
                 
            
              # Calculate num/den for each slice of sequence
            
              # Each sequence slice length = length of subKeys
            
              # Example:
              # seq = 'ATCGATA'
            
              # subKeys length = 3
            
              # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
          
              numList = []
            
              denList = []
            
              seqList = []
            
              for i in xrange(len(seq) - len(subKeys)):
            
                  subseq = seq[0:len(subKeys)]
            
                  seqList.append(subseq)
          
            
                  num, den = 1, 1
            
                  for j, s in enumerate(subseq):
            
                      num *= dataArray[s][subKeys[j]]
            
                      den *= value[s]
            
                  numList.append(num)
            
                  denList.append(den)
            
                  seq = seq[1:]
            
                 
              
              resultList = []
            
              for i, num in enumerate(numList):
                  #p=log10(num/denList[i])
                  #if (p) >=2:
                      #print "#########",abs(int(p))
                  if (log10(num/denList[i]))>=2:
                      #print "i am here"
          	    resultList.append(int(abs(1)))
              #print resultList
              #for i in resultList:
          	#mean=sum(resultList)/len(resultList)
                  #sub=mean-i
                  #queue = []
                  #queue = (sub)**2
                  #print sqrt(queue/len(resultList))
          	
              #print mean,"@@@@@@@@@@"
          	
                  
             
                 
            
              outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
              #print "this is line 294"
              
            
              return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
              
            
          if __name__ == '__main__':
              import os
              
            
          
              fnArray ='half.txt'
              outputfile='output_file'
              masterList=[]
              dir_name='New Folder'
              fileList=[os.path.join(dir_name,f) for f in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name,f))]
              
              
              
              for fnSeq in fileList:
                  outList=[]
                  calcdata=1
                  arraySet=1
                  
                  while not calcdata is None:
                      
                      seqSet=1
                      while True:
                          print "am here"
                          
                          calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
                          
                          if calcdata:
                              outList.append(calcdata)
                              #print outList
                              seqSet += 1
                              print seqSet,""################"
                          else:
                              break
                      arraySet+=1
                  masterList.append('\n'.join(outList))
              #f=open(outputfile,'W')
              #fwrite('\n'.join(masterList))
              #f.close()
          waiting for ur reply
          cheers!

          Comment

          • bvdet
            Recognized Expert Specialist
            • Oct 2006
            • 2851

            #80
            There is no need to post the entire code repeatedly. Presumably the code before "if __name__ == '__main__':" works. I assume that you have one array file and multiple sequence files. You want to iterate on each array set in the array file on each sequence set for each sequence file in a separate directory. The following code works for me given the previous assumptions:[code=Python]if __name__ == '__main__':
            import os

            fnArray = r'H:\array.txt'
            outputfile = r'H:\seq_array_ output.txt'
            masterList = []

            # user has multiple sequence files
            # and must iterate over all the files
            dir_name = r'H:\TEMP\seque nce_files
            fileList = [os.path.join(di r_name,f) for f in os.listdir(dir_ name)\
            if os.path.isfile( os.path.join(di r_name,f))]

            for fnSeq in fileList:
            outList = []
            calcdata = 1
            arraySet = 1
            while not calcdata is None:
            seqSet = 1
            while True:
            calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
            if calcdata:
            outList.append( calcdata)
            seqSet += 1
            else:
            break
            arraySet += 1
            masterList.appe nd('\n'.join(ou tList))
            f = open(outputfile , 'w')
            f.write('\n'.jo in(masterList))
            f.close() [/code]Following is a snippet from the array file:
            NA bin
            PO A C G T
            01 0.45 8.27 0.00 11.39
            02 0.00 0.00 10.02 10.09
            03 5.80 1.39 0.00 12.93
            //
            //
            NA bap
            PO A C G T
            01 0.00 3.67 0.00 0.00
            02 0.00 0.00 3.67 0.00
            03 0.00 0.00 0.00 3.67
            04 0.00 3.67 0.00 0.00
            //
            //
            NA bcd
            PO A C G T
            01 42.55 8.75 145.86 8.14
            02 0.14 0.53 204.64 0.00
            03 126.83 78.02 0.11 0.34
            04 0.21 0.17 0.00 204.92
            05 0.00 12.38 0.43 192.50
            06 174.48 0.95 1.32 28.56
            07 79.53 4.70 100.44 20.64
            //
            //
            Following is a snippet from one of the sequence files:
            >CG9571_O-E|Drosophila melanogaster|CG 9571|FBgn003108 6|X:19926374..1 9927133
            CCAGTCCACCGGCCG CCGATCTATTTATAC GAGAGGAAGAGGCTG AACTCGAGGATTACC CGTGTATCCTGGGAC GCG
            GATTAGCGATCCATT CCCCTTTTAATCGCC GCGCAAACAGATTCA TGAAAGCCTTCGGAT TCATTCATTGATCCA CAT
            CGTTTTTCTGTTATT AATAAAAAACCAATA GGAAAGTTCTCAAAA ATTACTCTGTTGTAT TTGATCATTTCTTTT CCG
            GTATAATCTTTTATT TTAAGCATTCCCATG TGAATAAATTTCAGA CTAATGTATTAATAA GATGTCGTGTTTTTC CAC
            TTACAAATTTCTCAT ACAGCTGGATATATA CTACGAGTACTATAC ACATGCTCTGGG
            >Cp36_DRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8323349..8324 136
            AGTCGACCAGCACGA GATCTCACCTACCTT CTTTATAAGCGGGGT CTCTAGAAGCTAAAT CCATGTCCACGTCAA ACC
            AAAGACTTGCGGTCT CCAGACCATTGAGTT CTATAAATGGGACTG AGCCACACCATACAC CACACACCACACATA CAC
            AACATTG
            >Cp36_PRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8324430..8324 513
            TCTAGAGATCTGGGC ACGATGGCGAGACAA AGATGCGGCGCAAAA TCGGAAATGGAGATG GATCACGTAGCCGGC CAT
            GGCGG
            >Him_distal|Dro sophila melanogaster|Hi m|FBgn0030900|X :18039896..1804 3470
            GGTTTTCTGCGATGG CTTCCGCGCCAGCTG AAGTATCTGATTTGC TGCCTTGTTTTTGTT GATATTTCTGCGAAG GGA
            CTTGTGCTTTTCAAA TGGCCTTTTTTTGGG ATTACGGCAAGGGCG CGTTTCCCACGCTCG ATCCCCACTTACCAT TGG
            TGCACGCGATTGCGG CAAGCTGCTGAGGCA AGCTATTAAACGCCA CACTGGGCCGGGGGG CGGTACCGGTGGGCG TGG
            AATATATAGATCGGA GATATCGCAGGACCC ACAGCAGAGCAGAGC CGCAGAGCCACCAAC CTCG
            I don't know what else I can do! Maybe you should check if your defined functions are working as expected.

            Comment

            • aboxylica
              New Member
              • Jul 2007
              • 111

              #81
              I sort of took away the spaces and tried executing.. the other functions are correct. please execute this code with the inputs i paste now and please paste ur o/p file
              my directory containing the sequence files is called "up"

              this contains 24 files.. but let me paste the first two files.
              first file called YAL012w contains the sequence
              >Scer_YAL012W SGDID:S0000010 5' untranslated region, Chr I 130534 - 130801, 268 bp
              ATATGTTGTAACGCT ATGTAATTCCACCCT TCATTACTAATAATT AGCCATTCACGTGAT CTCAGCCAGTTGTGG CGCCACACTTTTTTT TCCATAAAAATCCTC GAGGAAAAGAAAAGA AAAAAATATTTCAGT TATTTAAAGCATAAG ATGCCAGGTAGATGG AACTTGTGCCGTGCC AGATTGAATTTTGAA AGTACAATTGAGGCC TATACACATAGACAT TTGCACCTTATACAT ATACACACAAGACAA AACCAAAAAAAAT
              >Smik YAL012W c875:991..1258
              GCCACACCGTCTCTA CACAGCTTCTTGTTC TTGTTATTCCTAAAT ATGTTGTAACGCTAC GTAATTCCACCTTTC ATTACTAATATTAAG CCATTCACGTGACTT TTTCCCAGCGCCTGG TGTGGCGCCACACTT TTTTTTCATAAGAAT CCTCGAGGAAAACAG AAAAATATTTCAGTT ATTTAAACTATAAGA TGCTTGGGGATATAA TGATGGTGCCGTGTG TTTGTGTTTGAGTGT GTTTGAGTGTTCTGA AAGAGAGAGC
              >Spar YAL012W c218:45412..456 79 (rev)
              TTCTTAAATATGTTG TAACGCTATGTAATT CCACCCTTCATTACT AATAATTAGCCATTC ACGTGATACCAGCCA GCTGTGGCGCCACAC TTTTTTTCCATAAAA ATCCTCGAGGAAAAG AAAAGAAAAAATATT TCAGTTATTTAAACC AAAAGATGCCTGGGA GATAGAGCTGGTGCC GTGTGAAATCCAGTT TTGAAAGTGCAATTG AGTCCTACACACATA AGCATCCGCACCTTG TATACATATACGCCA AACAAAACAAAAA
              and the second file is called YAl013 and contains the following sequence
              >Scer_YAL013W SGDID:S0000011 5' untranslated region, Chr I 129021 - 129271, 251 bp
              TACCGTGTGTGCTTA CGCATCTATTTGCTG TCGTGAGATCTGTCT CTATGCTTTATTCGT TTTCCATTGTAAAGT CTCAGCATTTATTTC TTGTTCTTTACTTGT TTTTGCCCTTTCGGG TGATCAAAGTCGTGC TGGGAAATTTTATTC TTATAAAATGATTTT TAGAAATAATAAACT CATAACAGTGCAACG GCAAAGTACAAGGGA AGGAAGCACAGAAGC AAGAGGAGGCGCATC GATCGTGGCAG
              >Sbay YAL013W c669:46323..465 74 (rev)
              ACGTCCATCCCGTGT GTGCTTGCACATCTG CTTGGCCTGCGAGCC GTCCTCTGGGCCATC CGCTCCCTATTGTGA AGTACCAGCCTTTGT TCCTGCTCTTCATAG TTTGCTCTTTCGGGT GATGAAAGTTGGGCT GGAAATTTTATCCCA AAGGACAATATGAAA ATAAACTCGAAACTG TGTAGCAACCAAGAG GGAAGGGGAAAGGGA GAGACAGGAGCAAGA GGAAGCATCGGTCGT GGCAGATGAGTC
              >Spar YAL013W c218:46936..471 86 (rev)
              ATACCGTGTGTGCTT ACGCATCTATTTGTT CTCGTGAGCCCCGTC TCTGGACTTTATCCG TTCGCTATTGTAAAG TCTTAGCCTTTATCT CTAGTTCTTTACCTG CTTTTGCTCTTTCGG GTGATGAAAGTTGCG CTGGGAAATTTTATT CCCATAGAATGATTT TTAGAAATAATAAAC TCATAACAGTGCAAC GACCAAGTGCAGGGG AAAGTAGCATAGAAG CAAGAGGAGACGCAT CGATCGCGGCA

              the matrix file am using is called weight matrix.transfac .txt
              //
              NA Abd-B
              PO A C G T
              01 10.19 0.00 10.65 6.24
              02 5.79 0.67 10.50 10.11
              03 4.50 0.00 0.00 22.57
              04 0.00 0.00 0.00 27.08
              05 0.00 0.00 0.00 27.08
              06 0.00 0.00 0.00 27.08
              07 27.08 0.00 0.00 0.00
              08 0.00 2.83 0.00 24.25
              09 0.00 0.00 24.45 2.62
              10 19.33 0.00 4.34 3.41
              11 0.31 12.28 3.39 11.09
              //
              //
              NA Adf1
              PO A C G T
              01 0.71 0.08 26.02 1.55
              02 3.03 23.00 1.24 1.09
              03 0.26 10.50 3.29 14.31
              04 0.00 0.06 28.23 0.07
              05 0.12 27.27 0.06 0.91
              06 1.44 20.36 0.37 6.19
              07 5.35 0.28 21.49 1.24
              08 7.81 16.10 3.81 0.63
              09 0.51 17.77 0.45 9.63
              10 0.00 0.14 28.21 0.00
              11 0.00 25.69 0.20 2.46
              12 0.48 9.98 0.07 17.82
              13 1.27 0.00 27.01 0.07
              14 15.59 7.98 2.92 1.87
              15 4.28 22.37 0.00 1.70
              16 0.18 0.77 22.70 4.70
              //
              //
              NA Aef1
              PO A C G T
              01 0.00 0.06 12.49 0.00
              02 3.80 0.17 0.00 8.57
              03 0.87 0.06 0.00 11.62
              04 0.06 9.76 2.32 0.41
              05 9.82 0.00 2.73 0.00
              06 9.76 0.00 0.00 2.78
              07 3.80 0.31 0.00 8.43
              08 0.00 0.00 0.00 12.54
              09 0.00 6.53 5.85 0.17
              10 0.00 12.38 0.17 0.00
              11 2.73 1.02 8.80 0.00
              12 5.85 0.00 6.70 0.00
              13 1.02 5.96 0.00 5.57
              14 0.00 5.16 4.66 2.73
              15 1.03 7.55 3.97 0.00
              16 4.82 5.00 2.73 0.00
              //
              //

              Comment

              • aboxylica
                New Member
                • Jul 2007
                • 111

                #82
                and here is the code with spaces removed!
                Code:
                from math import *
                print "hellllllllo"
                def parseArray(fn, dataset=1, key='PO', term='/'):
                
                    '''
                
                    Read a formatted data file in matrix format and
                
                    compile data into a dictionary
                
                    '''
                    f = open(fn)
                    # skip to required data set
                    for _ in range(dataset):
                        try:
                            line = f.next()
                            while not line.startswith(key):
                                line = f.next()
                        except StopIteration, e:
                            print 'We have reached the end of the file!','matrix file'
                            f.close()
                            return False
                    headerList = line.strip().split()[1:]
                    lineList = []
                    line = f.next().strip()
                    while not line.startswith(term):
                        if line != '':
                            lineList.append(line.strip().split())
                    line = f.next().strip()
                    f.close()
                    # Key list
                    keys = [i[0] for i in lineList]
                    # Values list
                    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                    # Create a dictionary from keys and values
                    lineDict = dict(zip(keys, values))
                    dataDict = {}
                    for i, item in enumerate(headerList):
                        dataDict[item] = {}
                        for key in lineDict:
                            dataDict[item][key] = lineDict[key][i]
                    # Add 1.0 to every element in dataDict subdictionaries
                    for keyMain in dataDict:
                        for keySub in dataDict[keyMain]:
                            dataDict[keyMain][keySub] += 1.0
                    # Normalize original data (with 1 added) and update data
                    valueSums = [sum(item)+4 for item in values]
                    # print valueSums
                    for keyMain in dataDict:
                        for keySub in dataDict[keyMain]:
                            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                    return dataDict
                
                def parseData(fnSeq, dataset=1, key='>'):
                
                    '''
                
                    Read a formatted data file of sequences
                
                    Return a list of sequences
                
                    The first element in the list is the header
                
                    '''
                    # initialize output list
                    dataList = []
                    # open file for reading
                    f = open(fn)
                    # skip to required data set
                    for _ in range(dataset):
                        try:
                            s = f.next()
                            while not s.startswith(key):
                                s = f.next()
                        except StopIteration, e:
                            print 'We have reached the end of the file!','sequence file'
                            f.close()
                            return False
                    # initialize output list
                    dataList = [s,]
                    for line in f:
                        if not line.startswith(key):
                            dataList.append(line.strip())
                        else:
                            break
                    f.close()
                    return dataList
                
                def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                    
                   # sequence factor dictionary
                    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                    dataArray = parseArray(fnArray, arraySet)
                   #print dataArray
                    if dataArray:
                        dataSeq = parseData(fnSeq, seqSet)
                        #print dataSeq,"@@@@@@@@@@@@@@@@@@@"
                        if not dataSeq:
                            return False
                    else:
                        return None
                    # This is the complete sequence
                    seq = ''.join(dataSeq[1:])
                    # These are the subkeys of dataArray - '01', '02', '03',.............
                    subKeys = dataArray['A'].keys()
                    subKeys.sort()
                    # Calculate num/den for each slice of sequence
                    # Each sequence slice length = length of subKeys
                    # Example:
                    # seq = 'ATCGATA'
                    # subKeys length = 3
                    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                    numList = []
                    denList = []
                    seqList = []
                    for i in xrange(len(seq) - len(subKeys)):
                        subseq = seq[0:len(subKeys)]
                        seqList.append(subseq)
                        num, den = 1, 1
                        for j, s in enumerate(subseq):
                            num *= dataArray[s][subKeys[j]]
                            den *= value[s]
                        numList.append(num)
                        denList.append(den)
                        seq = seq[1:]
                    resultList = []
                    for i, num in enumerate(numList):
                        #p=log10(num/denList[i])
                        #if (p) >=2:
                            #print "#########",abs(int(p))
                        if (log10(num/denList[i]))>=2:
                            #print "i am here"
                            resultList.append(int(abs(1)))
                    #print resultList
                    #for i in resultList:
                	#mean=sum(resultList)/len(resultList)
                        #sub=mean-i
                        #queue = []
                        #queue = (sub)**2
                        #print sqrt(queue/len(resultList))
                	
                    #print mean,"@@@@@@@@@@"
                    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                    #print "this is line 294"
                    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                    
                  
                if __name__ == '__main__':
                    import os
                    fnArray =r'c:\python25\weightmatrix.transfac.txt'
                    outputfile=r'c:\python25\output_file.txt'
                    masterList=[]
                    dir_name=r'c:\python25\up'
                    fileList=[os.path.join(dir_name,f) for f in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name,f))]
                    for fnSeq in fileList:
                        outList=[]
                        calcdata=1
                        arraySet=1
                        while not calcdata is None:
                            seqSet=1
                            while True:
                                calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
                                if calcdata:
                                    outList.append(calcdata)
                                    print calcdata
                                    seqSet += 1
                                    #print seqSet,""################"
                                else:
                                    break
                            arraySet+=1
                        masterList.append('\n'.join(outList))
                    #f=open(outputfile,'W')
                    #fwrite('\n'.join(masterList))
                    #f.close()
                please paste ur output file!
                i have been stuck for a longtime now. the file should give you values 1 with sequence parts corresponding to it.
                waiting for your reply,
                cheers!

                Comment

                • aboxylica
                  New Member
                  • Jul 2007
                  • 111

                  #83
                  oh........ i got the problem! Thanks

                  Comment

                  • aboxylica
                    New Member
                    • Jul 2007
                    • 111

                    #84
                    i am encountering a problem when i run my code for a huge folder containing many sequence files.. this is what it says
                    Traceback (most recent call last):
                    File "this_final_11. py", line 163, in <module>
                    calcdata=compil eData(fnArray,f nSeq,arraySet,s eqSet)
                    File "this_final_11. py", line 121, in compileData
                    num *= dataArray[s][subKeys[j]]
                    KeyError: 'N'
                    Code:
                    def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                            
                        # sequence factor dictionary
                        value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                        dataArray = parseArray(fnArray, arraySet)
                       #print dataArray
                        if dataArray:
                            dataSeq = parseData(fnSeq, seqSet)
                            #print dataSeq,"@@@@@@@@@@@@@@@@@@@"
                            if not dataSeq:
                                return False
                        else:
                            return None
                        # This is the complete sequence
                        seq = ''.join(dataSeq[1:])
                        # These are the subkeys of dataArray - '01', '02', '03',.............
                        subKeys = dataArray['A'].keys()
                        subKeys.sort()
                        # Calculate num/den for each slice of sequence
                        # Each sequence slice length = length of subKeys
                        # Example:
                        # seq = 'ATCGATA'
                        # subKeys length = 3
                        # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                        numList = []
                        denList = []
                        seqList = []
                        for i in xrange(len(seq) - len(subKeys)):
                            subseq = seq[0:len(subKeys)]
                            seqList.append(subseq)
                            num, den = 1, 1
                            for j, s in enumerate(subseq):
                                num *= dataArray[s][subKeys[j]]
                                print num,"######",dataArray[s][subKeys[j]],"#########"
                                den *= value[s]
                            numList.append(num)
                            denList.append(den)
                            seq = seq[1:]
                        resultList = []
                        for i, num in enumerate(numList):
                            #p=log10(num/denList[i])
                            #if (p) >=2:
                                #print "#########",abs(int(p))
                            if (log10(num/denList[i]))>=2:
                                #print "i am here"
                                resultList.append(int(abs(1)))
                        #print resultList
                        #for i in resultList:
                    	#mean=sum(resultList)/len(resultList)
                            #sub=mean-i
                            #queue = []
                            #queue = (sub)**2
                            #print sqrt(queue/len(resultList))
                    	
                        #print mean,"@@@@@@@@@@"
                        outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                        #print "this is line 294"
                        return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                    can u see the problem.. i don get it

                    Comment

                    • bvdet
                      Recognized Expert Specialist
                      • Oct 2006
                      • 2851

                      #85
                      It appears that you have an 'N' in your sequence data. There is no such key in dataArray. Check your data files.

                      Comment

                      • aboxylica
                        New Member
                        • Jul 2007
                        • 111

                        #86
                        okay.. i have to say that if it encounters "N" the value of this alphabet should be taken as "0"... I am able to do this for denominator.. but for the numerator being calculated.. i want to specify that the log value of "N" is zero .. and if many "NNNNN" come together.. this part of "NNNN" can be omitted.. how do i do this??

                        Comment

                        • aboxylica
                          New Member
                          • Jul 2007
                          • 111

                          #87
                          this is what am doing and it doesnt seem to solve the problem.
                          Code:
                          numList = []
                          
                          
                          
                              denList = []
                              seqList = []
                              for i in xrange(len(seq) - len(subKeys)):
                                  subseq = seq[0:len(subKeys)]
                                  seqList.append(subseq)
                                  num, den = 1, 1
                                  for j, s in enumerate(subseq):
                                      if s=="N":
                                          value[s]=0
                                          
                                          
                                          num *= dataArray[s][subKeys[j]]
                                          den *= value[s]
                                  numList.append(num)
                                  denList.append(den)
                          seq = seq[1:]

                          Comment

                          • bvdet
                            Recognized Expert Specialist
                            • Oct 2006
                            • 2851

                            #88
                            Originally posted by aboxylica
                            this is what am doing and it doesnt seem to solve the problem.
                            Code:
                            numList = []
                            
                            
                            
                                denList = []
                                seqList = []
                                for i in xrange(len(seq) - len(subKeys)):
                                    subseq = seq[0:len(subKeys)]
                                    seqList.append(subseq)
                                    num, den = 1, 1
                                    for j, s in enumerate(subseq):
                                        if s=="N":
                                            value[s]=0
                                            
                                            
                                            num *= dataArray[s][subKeys[j]]
                                            den *= value[s]
                                    numList.append(num)
                                    denList.append(den)
                            seq = seq[1:]
                            You can test for a valid key and perform some alternative action if an invalid key is encountered:[code=Python] for j, s in enumerate(subse q):
                            if s in 'ACGT':
                            num *= dataArray[s][subKeys[j]]
                            den *= value[s]
                            else:
                            ..............
                            numList.append( num)
                            denList.append( den)[/code]Be careful with setting the variable den to zero, because the value will be used as a divisor later and will cause a ZeroDivisionErr or.

                            Comment

                            • aboxylica
                              New Member
                              • Jul 2007
                              • 111

                              #89
                              my code is running successffully.. my code ran for five hours.results were ok


                              but when its writing to a file it said.


                              We have reached the end of the file! sequence file

                              We have reached the end of the file! matrix file

                              Traceback (most recent call last):

                              File "C:\Python25\th is_final_1.py", line 174, in <module>

                              f.write('\n'.jo in(masterList))

                              MemoryError

                              i know its a very huge file but is there an efficient way to solve this problem??

                              Comment

                              • aboxylica
                                New Member
                                • Jul 2007
                                • 111

                                #90
                                And i also think that this program should not take such a longtime to run..though my sequence folder is 18MB and array file is 69KB. though it is pretty huge the calculation is simple only.. can i compute how long it will typically take to calculate??
                                I think there is some serious error
                                waiting for ur reply,
                                cheers!!

                                Comment

                                Working...