looping through a big file containing a set of files.

Collapse
X
 
  • Time
  • Show
Clear All
new posts
  • aboxylica
    New Member
    • Jul 2007
    • 111

    #61
    Am getting a list index out of range error!!
    but I cant add a -1 to the loop.can i??whats wrong with this??

    Code:
    from math import *
    def parseArray(fn, dataset=1, key='PO', term='/'):
    
        '''
    
        Read a formatted data file in matrix format and
    
        compile data into a dictionary
    
        '''
    
        f = open(fn)
    
     
    
        # skip to required data set
    
        for _ in range(dataset):
        
    
            try:
    
                line = f.next()
    
                while not line.startswith(key):
    
                    line = f.next()
    
            except StopIteration, e:
    
                print 'We have reached the end of the file!'
    
                f.close()
    
                return False
    
     
    
        headerList = line.strip().split()[1:]
    
        lineList = []
    
     
    
        line = f.next().strip()
    
        while not line.startswith(term):
    
            if line != '':
    
                lineList.append(line.strip().split())
    
            line = f.next().strip()
    
     
    
        f.close()
    
     
    
        # Key list
    
        keys = [i[0] for i in lineList]
    
        # Values list
    
        values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
    
     
    
        # Create a dictionary from keys and values
    
        lineDict = dict(zip(keys, values))
    
     
    
        dataDict = {}
    
     
    
        for i, item in enumerate(headerList):
    
            dataDict[item] = {}
    
            for key in lineDict:
    
                dataDict[item][key] = lineDict[key][i]
    
     
    
        # Add 1.0 to every element in dataDict subdictionaries
    
        for keyMain in dataDict:
    
            for keySub in dataDict[keyMain]:
    
                dataDict[keyMain][keySub] += 1.0
    
     
    
        # Normalize original data (with 1 added) and update data
    
        valueSums = [sum(item)+4 for item in values]
    
        # print valueSums
    
     
    
        for keyMain in dataDict:
    
            for keySub in dataDict[keyMain]:
                dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
    
     
        return dataDict
    
     
    
     
    
    def parseData(fn, dataset=1, key='>'):
    
        '''
    
        Read a formatted data file of sequences
    
        Return a list of sequences
    
        The first element in the list is the header
    
        '''   
    
        # initialize output list
    
        dataList = []
    
       
    
        # open file for reading
    
        f = open(fn)
    
       
    
        # skip to required data set
    
        for _ in range(dataset):
    
            try:
    
                s = f.next()
    
                while not s.startswith(key):
    
                    s = f.next()
    
            except StopIteration, e:
    
                print 'We have reached the end of the file!'
    
                f.close()
    
                return False
    
     
    
        # initialize output list
    
        dataList = [s,]
    
           
        for line in f:
    
            if not line.startswith(key):
    
                dataList.append(line.strip())
    
            else:
    
                break
    
     
    
        f.close()
    
        return dataList
    
     
    
    if __name__ == '__main__':
    
     
    
        arraySet = 4
        #print arraySet
    
        seqSet = 4
        #print seqSet
    
     
    
        value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
    
          
    
        fnArray = r'all_redfly.transfac.txt'
    
        fnSeq = r'redfly_sequence.fasta'
        indxSeq=1
        while True:
            dataSeq=parseData(fnSeq,indxSeq)
            if not dataSeq:
                break
            indxArray=1
            while True:
                    dataArray = parseArray(fnArray, arraySet)
                    #dataSeq = parseData(fnSeq, seqSet)
                    if not dataArray:
                        break
                    # This is the complete sequence
                    seq = ''.join(dataSeq[1:])
                    # These are the subkeys of dataArray - '01', '02', '03',.............
                    subKeys = dataArray['A'].keys()
                    subKeys.sort()
    
     
    
        # Calculate num/den for each slice of sequence
    
        # Each sequence slice length = length of subKeys
    
        # Example:
    
        # seq = 'ATCGATA'
    
        # subKeys length = 3
    
        # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
    
                    numList = []
    
                    denList = []
    
                    seqList = []
    
                    for i in xrange(len(seq) - len(subKeys) + 1):
    
                        subseq = seq[0:len(subKeys)]
    
                        seqList.append(subseq)
                        num, den = 1, 1
    
                        for j, s in enumerate(subseq):
    
                            num *= dataArray[s][subKeys[j]]
    
                            den *= value[s]
    
                            numList.append(num)
    
                            denList.append(den)
    
                            seq = seq[1:]
    
     
    
                            resultList = []
    
                            for i, num in enumerate(numList):
            
                                resultList.append(log10(num/denList[i]))
                        indxArray+=1
                    indxSeq +=1
                    
                    outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res) for i, res in enumerate(resultList)])
                    print 'Array set # = %d\nSequence set # = %d' % (arraySet, seqSet)
                    print 'Sequence Header: %s' % dataSeq[0]
                    print outStr

    Comment

    • bvdet
      Recognized Expert Specialist
      • Oct 2006
      • 2851

      #62
      Let's make a new function, iterate on it, and write the results to a file:
      [code=Python]def compileData(fnA rray, fnSeq, arraySet=1, seqSet=1):
      # sequence factor dictionary
      value={"A":0.3, "T":0.3,"C":0.2 ,"G":0.2}

      dataArray = parseArray(fnAr ray, arraySet)
      if dataArray:
      dataSeq = parseData(fnSeq , seqSet)
      if not dataSeq:
      return False
      else:
      return None

      # This is the complete sequence
      seq = ''.join(dataSeq[1:])
      # These are the subkeys of dataArray - '01', '02', '03',.......... ...
      subKeys = dataArray['A'].keys()
      subKeys.sort()

      # Calculate num/den for each slice of sequence
      # Each sequence slice length = length of subKeys
      # Example:
      # seq = 'ATCGATA'
      # subKeys length = 3
      # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
      numList = []
      denList = []
      seqList = []
      for i in xrange(len(seq) - len(subKeys) + 1):
      subseq = seq[0:len(subKeys)]
      seqList.append( subseq)
      num, den = 1, 1
      for j, s in enumerate(subse q):
      num *= dataArray[s][subKeys[j]]
      den *= value[s]
      numList.append( num)
      denList.append( den)
      seq = seq[1:]

      resultList = []
      for i, num in enumerate(numLi st):
      resultList.appe nd(num/denList[i])

      outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res) for i, res in enumerate(resul tList)])
      return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)

      if __name__ == '__main__':

      fnArray = 'array.txt'
      fnSeq = 'seq.txt'

      outputfile = 'sequence_calc_ data.txt'

      arraySet = 1
      outList = []
      calcdata = 1
      while not calcdata is None:
      seqSet = 1
      while True:
      calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
      if calcdata:
      outList.append( calcdata)
      seqSet += 1
      else:
      break
      arraySet += 1

      f = open(outputfile , 'w')
      f.write('\n'.jo in(outList))
      f.close() [/code]This resulted in a 3.1 mb file. Following are the first few lines of the first and last compilation:
      Code:
      Array set # = 1
      Sequence set # = 1
      Sequence Header: >CG9571_O-E|Drosophila melanogaster|CG9571|FBgn0031086|X:19926374..19927133
      
      Sequence = CCAGTCCACCGGCCGC Calculation = 0.000025722315
      Sequence = CAGTCCACCGGCCGCC Calculation = 0.000000000318
      Sequence = AGTCCACCGGCCGCCG Calculation = 0.000595631200
      Sequence = GTCCACCGGCCGCCGA Calculation = 0.000120125057
      Sequence = TCCACCGGCCGCCGAT Calculation = 0.000000089016
      ...........................
      Array set # = 4
      Sequence set # = 8
      Sequence Header: >Obp19b_prom|Drosophila melanogaster|Obp19b|FBgn0031110|X:20224439..20227440
      
      Sequence = ATTGCTGACGGGTCGA Calculation = 0.000005535136
      Sequence = TTGCTGACGGGTCGAA Calculation = 0.000003984295
      Sequence = TGCTGACGGGTCGAAT Calculation = 0.000053179344
      Sequence = GCTGACGGGTCGAATG Calculation = 0.000031549069
      .............................

      Comment

      • aboxylica
        New Member
        • Jul 2007
        • 111

        #63
        THis is the code.my o/p is an empty array.why is this happening?

        Code:
        from math import *
        def parseArray(fn, dataset=1, key='PO', term='/'):
        
            '''
        
            Read a formatted data file in matrix format and
        
            compile data into a dictionary
        
            '''
        
            f = open(fn)
        
         
        
            # skip to required data set
        
            for _ in range(dataset):
            
        
                try:
        
                    line = f.next()
        
                    while not line.startswith(key):
        
                        line = f.next()
        
                except StopIteration, e:
        
                    print 'We have reached the end of the file!'
        
                    f.close()
        
                    return False
        
         
        
            headerList = line.strip().split()[1:]
        
            lineList = []
        
         
        
            line = f.next().strip()
        
            while not line.startswith(term):
        
                if line != '':
        
                    lineList.append(line.strip().split())
        
                line = f.next().strip()
        
         
        
            f.close()
        
         
        
            # Key list
        
            keys = [i[0] for i in lineList]
        
            # Values list
        
            values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
        
         
        
            # Create a dictionary from keys and values
        
            lineDict = dict(zip(keys, values))
        
         
        
            dataDict = {}
        
         
        
            for i, item in enumerate(headerList):
        
                dataDict[item] = {}
        
                for key in lineDict:
        
                    dataDict[item][key] = lineDict[key][i]
        
         
        
            # Add 1.0 to every element in dataDict subdictionaries
        
            for keyMain in dataDict:
        
                for keySub in dataDict[keyMain]:
        
                    dataDict[keyMain][keySub] += 1.0
        
         
        
            # Normalize original data (with 1 added) and update data
        
            valueSums = [sum(item)+4 for item in values]
        
            # print valueSums
        
         
        
            for keyMain in dataDict:
        
                for keySub in dataDict[keyMain]:
                    dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
        
         
            return dataDict
        
         
        
         
        
        def parseData(fn, dataset=1, key='>'):
        
            '''
        
            Read a formatted data file of sequences
        
            Return a list of sequences
        
            The first element in the list is the header
        
            '''   
        
            # initialize output list
        
            dataList = []
        
           
        
            # open file for reading
        
            f = open(fn)
        
           
        
            # skip to required data set
        
            for _ in range(dataset):
        
                try:
        
                    s = f.next()
        
                    while not s.startswith(key):
        
                        s = f.next()
        
                except StopIteration, e:
        
                    print 'We have reached the end of the file!'
        
                    f.close()
        
                    return False
        
         
        
            # initialize output list
        
            dataList = [s,]
        
               
            for line in f:
        
                if not line.startswith(key):
        
                    dataList.append(line.strip())
        
                else:
        
                    break
        
         
        
            f.close()
        
            return dataList
        
         
        
           
        def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
           
            # sequence factor dictionary
           
            value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
           
                 
           
            dataArray = parseArray(fnArray, arraySet)
           
            if dataArray:
        
                dataSeq = parseData(fnSeq, seqSet)
           
                if not dataSeq:
           
                    return False
          
                else:
          
                    return None
          
                 
          
                # This is the complete sequence 
          
                seq = ''.join(dataSeq[1:])
        
                # These are the subkeys of dataArray - '01', '02', '03',.............
          
                subKeys = dataArray['A'].keys()
        
                subKeys.sort()
          
               
          
                # Calculate num/den for each slice of sequence
          
                  # Each sequence slice length = length of subKeys
          
                  # Example:
                    # seq = 'ATCGATA'
          
                  # subKeys length = 3
          
                  # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
        
                numList = []
          
                denList = []
          
                seqList = []
          
                for i in xrange(len(seq) - len(subKeys) + 1):
          
                    subseq = seq[0:len(subKeys)]
          
                    seqList.append(subseq)
          
                    num, den = 1, 1
          
                    for j, s in enumerate(subseq):
          
                        num *= dataArray[s][subKeys[j]]
          
                        den *= value[s]
          
                        numList.append(num)
          
                        denList.append(den)
          
                        seq = seq[1:]
          
               
          
                resultList = []
          
                for i, num in enumerate(numList):
        
                    resultList.append(num/denList[i])
          
               
          
                    outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)   for i, res in enumerate(resultList)])
          
                    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
          
        if __name__ == '__main__':
          
                 
            fnArray =r'all_redfly.transfac' 
            fnSeq = r'redfly_sequence.fasta'
          
            outputfile =  "sequence_calc_data.txt"
          
                 
          
            arraySet = 1
          
            outList = []
          
            calcdata = 1
          
            while not calcdata is None:
          
                seqSet = 1
          
                while True:
          
                    calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                    print calcdata
          
                    if calcdata:
          
                        outList.append(calcdata)
          
                        seqSet += 1
          
                    else:
          
                        break
          
                arraySet += 1
                
          
               
          
            f = open(outputfile, 'w')
          
            f.write('\n'.join(outList))
          
            f.close()
            f=open(outputfile,"r")
            file_con=f.readlines()
            print file_con
            for line in file_con:
                print line

        Comment

        • aboxylica
          New Member
          • Jul 2007
          • 111

          #64
          I seem to get an list index out of range error:
          Traceback (most recent call last):
          File "newbie1.py ", line 311, in <module>
          calcdata = compileData(fnA rray, fnSeq, arraySet, seqSet)
          File "newbie1.py ", line 285, in compileData
          outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)for i, res in enumerate(resul tList)])
          IndexError: list index out of range
          Code:
          from math import *
          def parseArray(fn, dataset=1, key='PO', term='/'):
          
              '''
          
              Read a formatted data file in matrix format and
          
              compile data into a dictionary
          
              '''
          
              f = open(fn)
          
           
          
              # skip to required data set
          
              for _ in range(dataset):
              
          
                  try:
          
                      line = f.next()
          
                      while not line.startswith(key):
          
                          line = f.next()
          
                  except StopIteration, e:
          
                      print 'We have reached the end of the file!'
          
                      f.close()
          
                      return False
          
           
          
              headerList = line.strip().split()[1:]
              
          
              lineList = []
          
           
          
              line = f.next().strip()
          
              while not line.startswith(term):
          
                  if line != '':
          
                      lineList.append(line.strip().split())
          
          
                  line = f.next().strip()
          
           
          
              f.close()
          
           
          
              # Key list
          
              keys = [i[0] for i in lineList]
          
              # Values list
          
              values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
          
           
          
              # Create a dictionary from keys and values
          
              lineDict = dict(zip(keys, values))
          
           
          
              dataDict = {}
          
           
          
              for i, item in enumerate(headerList):
          
                  dataDict[item] = {}
          
                  for key in lineDict:
          
                      dataDict[item][key] = lineDict[key][i]
          
           
          
              # Add 1.0 to every element in dataDict subdictionaries
          
              for keyMain in dataDict:
          
                  for keySub in dataDict[keyMain]:
          
                      dataDict[keyMain][keySub] += 1.0
          
           
          
              # Normalize original data (with 1 added) and update data
          
              valueSums = [sum(item)+4 for item in values]
          
              # print valueSums
          
           
          
              for keyMain in dataDict:
          
                  for keySub in dataDict[keyMain]:
                      dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
          
              return dataDict
          
           
          
           
          
          def parseData(fn, dataset=1, key='>'):
          
              '''
          
              Read a formatted data file of sequences
          
              Return a list of sequences
          
              The first element in the list is the header
          
              '''   
          
              # initialize output list
          
              dataList = []
          
             
          
              # open file for reading
          
              f = open(fn)
          
             
          
              # skip to required data set
          
              for _ in range(dataset):
          
          
                  try:
          
                      s = f.next()
          
                      while not s.startswith(key):
                      
          
                          s = f.next()
          
                  except StopIteration, e:
          
                      print 'We have reached the end of the file!'
          
                      f.close()
          
                      return False
          
           
          
              # initialize output list
          
              dataList = [s,]
          
                 
              for line in f:
          
                  if not line.startswith(key):
          
                      dataList.append(line.strip())
          
                  else:
          
                      break
          
           
          
              f.close()
          
              return dataList
          
          
           
          
             
          def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
             
              # sequence factor dictionary
             
              value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
             
                   
             
              dataArray = parseArray(fnArray, arraySet)
             
              if dataArray:
          
                  dataSeq = parseData(fnSeq, seqSet)
          
             
                  if not dataSeq:
             
                      return False
            
              else:
            
                  return None
              
            
                   
            
              # This is the complete sequence 
            
              seq = ''.join(dataSeq[1:])
              
          
          
              # These are the subkeys of dataArray - '01', '02', '03',.............
            
              subKeys = dataArray['A'].keys()
          
              subKeys.sort()
              
            
                 
            
              # Calculate num/den for each slice of sequence
            
              # Each sequence slice length = length of subKeys
            
              # Example:
              # seq = 'ATCGATA'
            
              # subKeys length = 3
            
              # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
          
              numList = []
            
              denList = []
            
              seqList = []
            
              for i in xrange(len(seq) - len(subKeys) + 1):
            
                  subseq = seq[0:len(subKeys)]
            
                  seqList.append(subseq)
          
            
                  num, den = 1, 1
            
                  for j, s in enumerate(subseq):
            
                      num *= dataArray[s][subKeys[j]]
            
                      den *= value[s]
            
                      numList.append(num)
            
                      denList.append(den)
            
                      seq = seq[1:]
            
                 
            
              resultList = []
            
              for i, num in enumerate(numList):
          
                  resultList.append(log10(num/denList[i]))
                  print (resultList)
            
                 
            
              outStr = '\n'.join(['Sequence = %s Calculation = %0.12f' % (seqList[i], res)for i, res in enumerate(resultList)])
            
              return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
            
          if __name__ == '__main__':
            
          
              fnArray ='all_redfly.transfac' 
              fnSeq = 'redfly_sequence.fasta'
            
              outputfile =  "sequence_calc_data.txt"
            
                   
            
              arraySet = 1
            
              outList = []
            
              calcdata = 1
            
              while not calcdata is None:
            
                  seqSet = 1
            
                  while True:
            
                      calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
            
                      if calcdata:
            
                          outList.append(calcdata)
            
                          seqSet += 1
            
                      else:
            
                          break
            
                  arraySet += 1
          
                  
            
                 
            
              f = open(outputfile, 'w')
            
              f.write('\n'.join(outList))
            
              f.close()
              f=open(outputfile,"r")
              file_con=f.readlines()
              print file_con
              for line in file_con:
                  print line
          waiting for ur reply,
          cheers!

          Comment

          • bvdet
            Recognized Expert Specialist
            • Oct 2006
            • 2851

            #65
            I am not sure why you add so many spaces in between the lines of code. I personally find it unreadable. Anyway, when you were adding all the spaces, some of the code ended up at the incorrect indentation:[code=Python]........for j, s in enumerate(subse q):


            num *= dataArray[s][subKeys[j]]


            den *= value[s]


            numList.append( num)


            denList.append( den)


            seq = seq[1:]
            [/code]SHOULD BE:[code=Python]........for j, s in enumerate(subse q):
            num *= dataArray[s][subKeys[j]]
            den *= value[s]
            numList.append( num)
            denList.append( den)
            seq = seq[1:][/code]

            Comment

            • aboxylica
              New Member
              • Jul 2007
              • 111

              #66
              hey,
              That was the mistake.amazing !! thanks a million!!
              I got some doubts about the program.
              i have some doubts. first understandingan d then get back to you.
              THANKS A MILLION!
              cheers!!

              Comment

              • aboxylica
                New Member
                • Jul 2007
                • 111

                #67
                hey,
                here is the code where I tried removing the try catch block and couple of things which will make it easier for me to understand.but looks like there is some problem ..I will of course use them in my main program.But I was just trying to understand when I tried executing the iteration was not happening and when I said
                print outList instead of storing it in a file it was not iterating.This is the code
                can you tell me whats happening???
                Code:
                from math import *
                def parseArray(fn,dataset=1,key='PO',term='/'):
                    f=open(fn)
                    for _ in range(dataset):
                        line=f.next()
                        while not line.startswith(key):
                            line=f.next()
                    headerList=line.strip().split()[1:]
                    lineList=[]
                    line=f.next().strip()
                    while not line.startswith(term):
                        if line!='':
                            lineList.append(line.strip().split())
                        line=f.next().strip()
                        # f.close()
                    keys=[i[0] for i in lineList]
                    values=[[float(s) for s in item] for item in [j[1:] for j in lineList]]
                    lineDict=dict(zip(keys,values))
                    dataDict={}
                    for i,item in enumerate(headerList):
                        dataDict[item]={}
                        for key in lineDict:
                            dataDict[item][key]=lineDict[key][i]
                    for keyMain in dataDict:
                        for keySub in dataDict[keyMain]:
                            dataDict[keyMain][keySub]+=1.0
                    valueSums=[sum(item)+4 for item in values]
                    for keyMain in dataDict:
                        for keySub in dataDict[keyMain]:
                            dataDict[keyMain][keySub]/=valueSums[int(keySub)-1]
                    return dataDict
                #fn="weight_matrix.transfac.txt"
                #p=parseArray(fn)
                #print p
                def parseData(fn,dataset=1,key='>'):
                    dataList=[]
                    f=open(fn)
                    for _ in range(dataset):
                        s=f.next()
                    dataList=[s,]
                    
                    for line in f:
                        if not line.startswith(key):
                            dataList.append(line.strip())
                        else:
                            break
                    return dataList
                #fn="redfly_sequence.fasta"
                #p=parseData(fn)
                #print p
                def compileData(fnArray,fnSeq,arraySet=1,seqSet=1):
                    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                    dataArray=parseArray(fnArray,arraySet)
                    if dataArray:
                        dataSeq=parseData(fnSeq,seqSet)
                    seq=''.join(dataSeq[1:])
                    subKeys=dataArray['A'].keys()
                    subKeys.sort()
                    numList=[]
                    denList=[]
                    seqList=[]
                    for i in xrange(len(seq)-len(subKeys)):
                        subseq=seq[0:len(subKeys)]
                        seqList.append(subseq)
                        num,den=1,1
                        for j,s in enumerate(subseq):
                            num*=dataArray[s][subKeys[j]]
                            den*=value[s]
                        numList.append(num)
                        denList.append(den)
                        seq=seq[1:]
                    resultList=[]
                    for i,num in enumerate(numList):
                        if (log10(num/denList[i]))>2:
                            resultList.append(log10(num/denList[i]))
                    outStr='\n'.join(['sequence=%s Calculation=%0.12f'%(seqList[i],res) for i,res in enumerate(resultList)])
                    return 'array set#= %d\nSequence set #=%d\nSequence Header: %s\n%s' %(arraySet,seqSet,dataSeq[0],outStr)
                fnArray='weight_matrix.transfac.txt'
                fnSeq='redfly_sequence.fasta'
                arraySet=1
                outList=[]
                calcdata=1
                while not calcdata is None:
                    seqSet=1
                    while True:
                        calcdata=compileData(fnArray,fnSeq,arraySet,seqSet)
                        if calcdata:
                            outList.append(calcdata)
                            
                    
                            seqSet+=1
                        else:
                            break
                        
                    arraySet+=1
                print outList
                f=open(outputfile,'w')
                f.write('/n'.join(outList))
                f.close()
                waiting
                cheers!!

                Comment

                • bvdet
                  Recognized Expert Specialist
                  • Oct 2006
                  • 2851

                  #68
                  After running the script, I can do this:[code=Python]>>> print outList[1]
                  Array set # = 1
                  Sequence set # = 2
                  Sequence Header: >Cp36_DRR|Droso phila melanogaster|Cp 36|FBgn0000359| X:8323349..8324 136

                  Sequence = AGTCGACCAGCACGA G Calculation = -0.872390330485
                  Sequence = GTCGACCAGCACGAG A Calculation = -3.287525755636
                  Sequence = TCGACCAGCACGAGA T Calculation = -4.346213357398
                  Sequence = CGACCAGCACGAGAT C Calculation = -2.329064001005
                  ............... ..........[/code]I don't want to print the entire outList because it's over 3 MB.
                  You may have changed something you should not have. Maybe you should copy the code again. If you need to change things, change only one thing at a time and test to make sure it still works.

                  Comment

                  • aboxylica
                    New Member
                    • Jul 2007
                    • 111

                    #69
                    hello!
                    I hope you people remember the problem above..
                    i got little problems with that
                    that was just opening a file containing files..now il be opening a directory containing different sequence files
                    this is how the code looks now!
                    am trying to change the i/p file to folder by showing the path of the folder but its going to the exception file..can you tell me why?
                    Code:
                    from math import *
                    def parseArray(fn, dataset=1, key='PO', term='/'):
                    
                        '''
                    
                        Read a formatted data file in matrix format and
                    
                        compile data into a dictionary
                    
                        '''
                    
                        f = open(fn)
                    
                     
                    
                        # skip to required data set
                    
                        for _ in range(dataset):
                        
                    
                            try:
                    
                                line = f.next()
                    
                                while not line.startswith(key):
                    
                                    line = f.next()
                    
                            except StopIteration, e:
                    
                                print 'We have reached the end of the file!'
                    
                                f.close()
                    
                                return False
                    
                     
                    
                        headerList = line.strip().split()[1:]
                        
                    
                        lineList = []
                    
                     
                    
                        line = f.next().strip()
                    
                        while not line.startswith(term):
                    
                            if line != '':
                    
                                lineList.append(line.strip().split())
                    
                    
                            line = f.next().strip()
                    
                     
                    
                        f.close()
                    
                     
                    
                        # Key list
                    
                        keys = [i[0] for i in lineList]
                    
                        # Values list
                    
                        values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                    
                     
                    
                        # Create a dictionary from keys and values
                    
                        lineDict = dict(zip(keys, values))
                    
                     
                    
                        dataDict = {}
                    
                     
                    
                        for i, item in enumerate(headerList):
                    
                            dataDict[item] = {}
                    
                            for key in lineDict:
                    
                                dataDict[item][key] = lineDict[key][i]
                    
                     
                    
                        # Add 1.0 to every element in dataDict subdictionaries
                    
                        for keyMain in dataDict:
                    
                            for keySub in dataDict[keyMain]:
                    
                                dataDict[keyMain][keySub] += 1.0
                    
                     
                    
                        # Normalize original data (with 1 added) and update data
                    
                        valueSums = [sum(item)+4 for item in values]
                    
                        # print valueSums
                    
                     
                    
                        for keyMain in dataDict:
                    
                            for keySub in dataDict[keyMain]:
                                dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                    
                        return dataDict
                    
                     
                    
                     
                    
                    def parseData(fn, dataset=1, key='>'):
                    
                        '''
                    
                        Read a formatted data file of sequences
                    
                        Return a list of sequences
                    
                        The first element in the list is the header
                    
                        '''   
                    
                        # initialize output list
                    
                        dataList = []
                    
                       
                    
                        # open file for reading
                    
                        f = open(fn)
                    
                       
                    
                        # skip to required data set
                    
                        for _ in range(dataset):
                    
                    
                            try:
                    
                                s = f.next()
                    
                                while not s.startswith(key):
                                
                    
                                    s = f.next()
                    
                            except StopIteration, e:
                    
                                print 'We have reached the end of the file!'
                    
                                f.close()
                    
                                return False
                    
                     
                    
                        # initialize output list
                    
                        dataList = [s,]
                    
                           
                        for line in f:
                    
                            if not line.startswith(key):
                    
                                dataList.append(line.strip())
                    
                            else:
                    
                                break
                    
                     
                    
                        f.close()
                    
                        return dataList
                    
                    
                     
                    
                       
                    def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                       
                        # sequence factor dictionary
                       
                        value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                       
                             
                       
                        dataArray = parseArray(fnArray, arraySet)
                    
                       
                        if dataArray:
                    
                            dataSeq = parseData(fnSeq, seqSet)
                    
                       
                            if not dataSeq:
                       
                                return False
                      
                        else:
                      
                            return None
                        
                      
                             
                      
                        # This is the complete sequence 
                      
                        seq = ''.join(dataSeq[1:])
                    
                        
                        
                    
                    
                        # These are the subkeys of dataArray - '01', '02', '03',.............
                      
                        subKeys = dataArray['A'].keys()
                    
                        subKeys.sort()
                    
                        
                      
                           
                      
                        # Calculate num/den for each slice of sequence
                      
                        # Each sequence slice length = length of subKeys
                      
                        # Example:
                        # seq = 'ATCGATA'
                      
                        # subKeys length = 3
                      
                        # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                    
                        numList = []
                      
                        denList = []
                      
                        seqList = []
                      
                        for i in xrange(len(seq) - len(subKeys)):
                      
                            subseq = seq[0:len(subKeys)]
                      
                            seqList.append(subseq)
                    
                      
                            num, den = 1, 1
                      
                            for j, s in enumerate(subseq):
                      
                                num *= dataArray[s][subKeys[j]]
                      
                                den *= value[s]
                      
                            numList.append(num)
                      
                            denList.append(den)
                      
                            seq = seq[1:]
                      
                           
                        
                        resultList = []
                      
                        for i, num in enumerate(numList):
                            
                            if (log10(num/denList[i]))>=2:
                              
                    	    resultList.append(int(abs(1)))
                       
                            
                       
                           
                      
                        outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                        
                        
                      
                        return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                        
                      
                    if __name__ == '__main__':
                        
                      
                    
                        fnArray ='half.txt'
                       
                        fnSeq = 'C:\\python25\ding\YAL005C.txt'
                        
                        
                      
                        outputfile =  "sequence_calc_data.txt"
                      
                             
                      
                        arraySet = 1
                      
                        outList = []
                      
                        calcdata = 1
                      
                        while not calcdata is None:
                      
                            seqSet = 1
                      
                            while True:
                      
                                calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                                print calcdata
                      
                                if calcdata:
                      
                                    outList.append(calcdata)
                      
                                    seqSet += 1
                      
                                else:
                      
                                    break
                      
                            arraySet += 1
                    
                            
                      
                           
                      
                        f = open(outputfile, 'w')
                      
                        f.write('\n'.join(outList))
                      
                        f.close()
                        #f=open(outputfile,"r")
                        #file_con=f.readlines()
                        #for line in file_con:
                         #   print line
                    please tell me what can i do??

                    Comment

                    • aboxylica
                      New Member
                      • Jul 2007
                      • 111

                      #70
                      here is my code which is reading a directory containing files..... it seems to go to the exception part always.. i dono why..i think it checks for the first file in the folder and then comes out..how do i check if its going to all the files..
                      Code:
                      from math import *
                      def parseArray(fn, dataset=1, key='PO', term='/'):
                      
                          '''
                      
                          Read a formatted data file in matrix format and
                      
                          compile data into a dictionary
                      
                          '''
                      
                          f = open(fn)
                      
                       
                      
                          # skip to required data set
                      
                          for _ in range(dataset):
                          
                      
                              try:
                      
                                  line = f.next()
                                  print "am here"
                      
                                  while not line.startswith(key):
                                      print "oh yes"
                      
                                      line = f.next()
                      
                              except StopIteration, e:
                                  print '###############################'
                      
                                  print 'We have reached the end of the file!'
                      
                                  f.close()
                      
                                  return False
                      
                       
                      
                          headerList = line.strip().split()[1:]
                          
                      
                          lineList = []
                      
                       
                      
                          line = f.next().strip()
                      
                          while not line.startswith(term):
                      
                              if line != '':
                      
                                  lineList.append(line.strip().split())
                      
                      
                              line = f.next().strip()
                      
                       
                      
                          f.close()
                      
                       
                      
                          # Key list
                      
                          keys = [i[0] for i in lineList]
                      
                          # Values list
                      
                          values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                      
                       
                      
                          # Create a dictionary from keys and values
                      
                          lineDict = dict(zip(keys, values))
                      
                       
                      
                          dataDict = {}
                      
                       
                      
                          for i, item in enumerate(headerList):
                      
                              dataDict[item] = {}
                      
                              for key in lineDict:
                      
                                  dataDict[item][key] = lineDict[key][i]
                      
                       
                      
                          # Add 1.0 to every element in dataDict subdictionaries
                      
                          for keyMain in dataDict:
                      
                              for keySub in dataDict[keyMain]:
                      
                                  dataDict[keyMain][keySub] += 1.0
                      
                       
                      
                          # Normalize original data (with 1 added) and update data
                      
                          valueSums = [sum(item)+4 for item in values]
                      
                          # print valueSums
                      
                       
                      
                          for keyMain in dataDict:
                      
                              for keySub in dataDict[keyMain]:
                                  dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                      
                          return dataDict
                      
                       
                      
                       
                      
                      def parseData(fn, dataset=1, key='>'):
                      
                          '''
                      
                          Read a formatted data file of sequences
                      
                          Return a list of sequences
                      
                          The first element in the list is the header
                      
                          '''   
                      
                          # initialize output list
                      
                          dataList = []
                      
                         
                      
                          # open file for reading
                      
                          f = open(fn)
                      
                         
                      
                          # skip to required data set
                      
                          for _ in range(dataset):
                      
                      
                              try:
                      
                                  s = f.next()
                      
                                  while not s.startswith(key):
                                  
                      
                                      s = f.next()
                      
                              except StopIteration, e:
                      
                                  print 'We have reached the end of the file!'
                                  print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@'
                      
                                  f.close()
                      
                                  return False
                      
                       
                      
                          # initialize output list
                      
                          dataList = [s,]
                      
                             
                          for line in f:
                      
                              if not line.startswith(key):
                      
                                  dataList.append(line.strip())
                      
                              else:
                      
                                  break
                      
                       
                      
                          f.close()
                      
                          return dataList
                      
                      
                       
                      
                         
                      def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                         
                          # sequence factor dictionary
                         
                          value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                         
                               
                         
                          dataArray = parseArray(fnArray, arraySet)
                      
                         
                          if dataArray:
                      
                              dataSeq = parseData(fnSeq, seqSet)
                      
                         
                              if not dataSeq:
                         
                                  return False
                        
                          else:
                        
                              return None
                          
                        
                               
                        
                          # This is the complete sequence 
                        
                          seq = ''.join(dataSeq[1:])
                      
                          
                          
                      
                      
                          # These are the subkeys of dataArray - '01', '02', '03',.............
                        
                          subKeys = dataArray['A'].keys()
                      
                          subKeys.sort()
                      
                          
                        
                             
                        
                          # Calculate num/den for each slice of sequence
                        
                          # Each sequence slice length = length of subKeys
                        
                          # Example:
                          # seq = 'ATCGATA'
                        
                          # subKeys length = 3
                        
                          # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                      
                          numList = []
                        
                          denList = []
                        
                          seqList = []
                        
                          for i in xrange(len(seq) - len(subKeys)):
                        
                              subseq = seq[0:len(subKeys)]
                        
                              seqList.append(subseq)
                      
                        
                              num, den = 1, 1
                        
                              for j, s in enumerate(subseq):
                        
                                  num *= dataArray[s][subKeys[j]]
                        
                                  den *= value[s]
                        
                              numList.append(num)
                        
                              denList.append(den)
                        
                              seq = seq[1:]
                        
                             
                          
                          resultList = []
                        
                          for i, num in enumerate(numList):
                              #p=log10(num/denList[i])
                              #if (p) >=2:
                                  #print "#########",abs(int(p))
                              if (log10(num/denList[i]))>=2:
                                  #print "i am here"
                      	    resultList.append(int(abs(1)))
                          #print resultList
                          #for i in resultList:
                      	#mean=sum(resultList)/len(resultList)
                              #sub=mean-i
                              #queue = []
                              #queue = (sub)**2
                              #print sqrt(queue/len(resultList))
                      	
                          #print mean,"@@@@@@@@@@"
                      	
                              
                         
                             
                        
                          outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                          #print "this is line 294"
                          
                        
                          return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                          
                        
                      if __name__ == '__main__':
                          
                        
                      
                          fnArray ='C:\\python25\\half.txt'
                          import os
                          seq_=os.listdir("ding")
                          print seq_
                          os.chdir("C:\\python25\\New Folder")
                          for file_ in seq_:
                              if os.path.isfile(file_):
                                  rem=open(file_)
                                  dingg=rem.readlines()
                          fnSeq = dingg
                          
                          
                        
                          outputfile =  "sequence_calc_data.txt"
                        
                               
                        
                          arraySet = 1
                        
                          outList = []
                        
                          calcdata = 1
                        
                          while not calcdata is None:
                        
                              seqSet = 1
                        
                              while True:
                        
                                  calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                                  print calcdata
                        
                                  if calcdata:
                        
                                      outList.append(calcdata)
                        
                                      seqSet += 1
                        
                                  else:
                        
                                      break
                        
                              arraySet += 1
                      
                              
                        
                             
                        
                          f = open(outputfile, 'w')
                        
                          f.write('\n'.join(outList))
                        
                          f.close()
                          #f=open(outputfile,"r")
                          #file_con=f.readlines()
                          #for line in file_con:
                           #   print line
                      waiting for ur reply,
                      cheers!

                      Comment

                      • bvdet
                        Recognized Expert Specialist
                        • Oct 2006
                        • 2851

                        #71
                        What error message are you receiving? It appears that you are trying to read data in from the files in a directory. You should be passing the file name to your functions, not the text from one of the files. Try something like this:[code=Python]import os
                        # Create a list of directory entries
                        dir_name = 'your_directory '
                        fList = os.listdir(dir_ name)
                        # The directory entries do not include the full path
                        # Create a list of file names with full path, eliminating subdirectory entries
                        fList1 = [os.path.join(di r_name, f) for f in fList if os.path.isfile( os.path.join(di r_name, f))]
                        seqSetIndex = 0
                        fnSeq = fList1[seqSetIndex]
                        while True:
                        ... do stuff ...
                        seqSetIndex += 1[/code]'fnSeq' is the file name will full path that is passed to your parsing function.

                        Comment

                        • aboxylica
                          New Member
                          • Jul 2007
                          • 111

                          #72
                          to show you typically how my files are.. I am storing them inside python 25 folder..the folder is called New folder which has around five thousand files..so i cant possibly give the names of all the files. for eg YAL001C,YAL002w ..etc
                          each file has sequence like this:
                          >Scer_YAL001C SGDID:S0000001 5' untranslated region, Chr I 151168 - 152167 (revcom), 1000 bp
                          ACTTGTAAATATATC TTTTATTTTCCGAGA GGAAAAAGTTTCAAA AAAAAAAAAAAAAAA AGAAGAAAAATAACT TTCTCATGTAATAAA AGGTAACTAATGTAG ACAAAAAAGTATACA TTTAGCTTTTCTTTT TTTGATGATTTTTGA GTTTCATGTTACTAA TCAGAACAATTAACG ACACCTTCATTATGA AAAAATTAATTAGCT ATAAGTCTTCGAAGT AGAACATGATATTTG GCAATCACTCGAATA ACTATCTTAATTTAC CTGCTGAAATAATTT GAAAAAACACCCGAG GCAGCAGACGAAAGG TGTTTTTGCTAAACA ATGATTGATTTCTGG CGCCATTTCTACATT CTGAACAGTTCATCT CATTTCAGTAACAGT ACTTCAATGGAATAT TTATTAAAGAAAGTG CTTAAAAAAGTATTA TAAAACGATACATGG ACTGACTCAAGATTG AGCTAATAAGGTCCA CCGCCTAGTGCTTAA GAGTTCTGTACCACT ATAATAATTTATCTT GATCGTATTATGTGT AAAAAAAAGGCGCTT GAAATGAAAGCTCCG AAAATTAAAATACTT TGACTGCTTCGGAAA ACAAAAACATATAAA TAAATTTAAAAAATA AACTGTAAAATATTT AAAAACTATTAAAAA TATTTTATATTTTTA AAATTATTTATTATT ATGTCATGTGACAAG ACTTAAATCATTACA TAAAAGGTTTTGAAG TTCAATGTCAAAGTC AATATAATAAGCATA CTAAGGCACACTTAT GCAAATCGAGTTATT GAAGCTGGTAAAATT ATAAGATTTTTATTT TTATTTCTTTTATTT CTGCAAATCTGCATT TTCAAATACCGCTTG GTTTTTTGCATCATA AAGGGCGGCGCTTTC AGTCGCGAAAGTGAA ATAAACAACCAGTCA CACATATAACTTTCT TCTTGCCATAAGAGA GAAGAGGACGTTTGG TTGAAGCCAACTAGC CACAAGAAAA
                          >Spar YAL001C c218:24375..253 75
                          TCAAGAGGGTTATTA TATACCGATATTTGA ATCCACACATGATCG AACTAATATAATTCA CTACTTAGCTGCTTA ACCATTCTTTGCCAT TATAATAAATGTATC TCGATCGAATGCTGC ATAGAGAAAAGCGCT TAAAAAAGTGTTCCG AAATATAACATATTT TAAACGTTTCGGAAA CCAAAAACATATATT TATTATCTTAAAGTA ATCTAAAAAATGAAA GAAACTTTGATATAT TTAAAACATAATATA ATTATTTTATAAAAG TAACATGTGATTAAA CTCACAAAGCCTAAA AATGTTTTTAATCAT TATGTTAAGCTGAAT ATAGTATATCTAATA ATTGTTTATTTAAGC AGATTGAACTGTGAA TGCTAGTAAATTTTT AAGGGTTTTTTGTTG TTCTGCTTTTCTCTA GTTTTGTAGTGTCAA ATACAACTAACTGGA TTTTTTGCATCAGAA GGGGCGCTTTAACTC GCGAAAGTAAAAATA AACAATTAATCACAC ATATCTTTTCCTCTT GCAGAAGCAAAGAAG AGGACGGTTGATTGA AGCAAACCAGCCACA GAAAATATGGCGTTG ACAATTTATCCTGAT GAACTGGTTAAAATA GTGTCTGATGAAATT GCATCAAATAAAGGA AGTATGTATATGCCT CATTCTTCTATTCCA TGTTCTTTTCAGGTG AGAAACGTGATATAT TGTAAGATTATTTAC TAACGACTTATTAAA GAAATTACGTTAAAT CAGCTTTGGGATATA TCTCGTAAATATTTT GATTTGTCTGACGAG AAAGTTAAACAATTT GTGCTTTCATGCGTG ATGTTGAAAAAGGAC ATCGAGGTGTACTGT GATAGCGTTATAACA ACTAAAAACGTGACA AATATTATAGACGAC ACTAGTCATTCATAC TCAGTAGGGATTACT GAGGACAGCCTGTGG ACGTTATTAACTGGA TACACAAAGAAGGAG TCAACTATCGGGAAT TCAGCATTTGA

                          similarly each file has such sequences my code has to go to all the sequences and do it for all the files.
                          when i run this code , it says

                          Microsoft Windows XP [Version 5.1.2600]
                          (C) Copyright 1985-2001 Microsoft Corp.



                          C:\Python25>pyt hon this_final.py
                          ['sequence_calc_ data.txt', 'YAL001C.fasta' , 'YAL002W.fasta' , 'YAL003W.fasta' , 'Y
                          AL005C.fasta', 'YAL007C.fasta']
                          am here
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          oh yes
                          ############### ############### #
                          We have reached the end of the file!
                          None
                          why is this happening.
                          this is my code
                          Code:
                          from math import *
                          def parseArray(fn, dataset=1, key='PO', term='/'):
                          
                              '''
                          
                              Read a formatted data file in matrix format and
                          
                              compile data into a dictionary
                          
                              '''
                          
                              f = open(fn)
                          
                           
                          
                              # skip to required data set
                          
                              for _ in range(dataset):
                              
                          
                                  try:
                          
                                      line = f.next()
                          
                                      while not line.startswith(key):
                          
                                          line = f.next()
                          
                                  except StopIteration, e:
                          
                                      print 'We have reached the end of the file!'
                          
                                      f.close()
                          
                                      return False
                          
                           
                          
                              headerList = line.strip().split()[1:]
                              
                          
                              lineList = []
                          
                           
                          
                              line = f.next().strip()
                          
                              while not line.startswith(term):
                          
                                  if line != '':
                          
                                      lineList.append(line.strip().split())
                          
                          
                                  line = f.next().strip()
                          
                           
                          
                              f.close()
                          
                           
                          
                              # Key list
                          
                              keys = [i[0] for i in lineList]
                          
                              # Values list
                          
                              values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                          
                           
                          
                              # Create a dictionary from keys and values
                          
                              lineDict = dict(zip(keys, values))
                          
                           
                          
                              dataDict = {}
                          
                           
                          
                              for i, item in enumerate(headerList):
                          
                                  dataDict[item] = {}
                          
                                  for key in lineDict:
                          
                                      dataDict[item][key] = lineDict[key][i]
                          
                           
                          
                              # Add 1.0 to every element in dataDict subdictionaries
                          
                              for keyMain in dataDict:
                          
                                  for keySub in dataDict[keyMain]:
                          
                                      dataDict[keyMain][keySub] += 1.0
                          
                           
                          
                              # Normalize original data (with 1 added) and update data
                          
                              valueSums = [sum(item)+4 for item in values]
                          
                              # print valueSums
                          
                           
                          
                              for keyMain in dataDict:
                          
                                  for keySub in dataDict[keyMain]:
                                      dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                          
                              return dataDict
                          
                           
                          
                           
                          
                          def parseData(fn, dataset=1, key='>'):
                          
                              '''
                          
                              Read a formatted data file of sequences
                          
                              Return a list of sequences
                          
                              The first element in the list is the header
                          
                              '''   
                          
                              # initialize output list
                          
                              dataList = []
                          
                             
                          
                              # open file for reading
                          
                              f = open(fn)
                          
                             
                          
                              # skip to required data set
                          
                              for _ in range(dataset):
                          
                          
                                  try:
                          
                                      s = f.next()
                          
                                      while not s.startswith(key):
                                      
                          
                                          s = f.next()
                          
                                  except StopIteration, e:
                          
                                      print 'We have reached the end of the file!'
                          
                                      f.close()
                          
                                      return False
                          
                           
                          
                              # initialize output list
                          
                              dataList = [s,]
                          
                                 
                              for line in f:
                          
                                  if not line.startswith(key):
                          
                                      dataList.append(line.strip())
                          
                                  else:
                          
                                      break
                          
                           
                          
                              f.close()
                          
                              return dataList
                          
                          
                           
                          
                             
                          def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                             
                              # sequence factor dictionary
                             
                              value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                             
                                   
                             
                              dataArray = parseArray(fnArray, arraySet)
                          
                             
                              if dataArray:
                          
                                  dataSeq = parseData(fnSeq, seqSet)
                          
                             
                                  if not dataSeq:
                             
                                      return False
                            
                              else:
                            
                                  return None
                              
                            
                                   
                            
                              # This is the complete sequence 
                            
                              seq = ''.join(dataSeq[1:])
                          
                              
                              
                          
                          
                              # These are the subkeys of dataArray - '01', '02', '03',.............
                            
                              subKeys = dataArray['A'].keys()
                          
                              subKeys.sort()
                          
                              
                            
                                 
                            
                              # Calculate num/den for each slice of sequence
                            
                              # Each sequence slice length = length of subKeys
                            
                              # Example:
                              # seq = 'ATCGATA'
                            
                              # subKeys length = 3
                            
                              # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                          
                              numList = []
                            
                              denList = []
                            
                              seqList = []
                            
                              for i in xrange(len(seq) - len(subKeys)):
                            
                                  subseq = seq[0:len(subKeys)]
                            
                                  seqList.append(subseq)
                          
                            
                                  num, den = 1, 1
                            
                                  for j, s in enumerate(subseq):
                            
                                      num *= dataArray[s][subKeys[j]]
                            
                                      den *= value[s]
                            
                                  numList.append(num)
                            
                                  denList.append(den)
                            
                                  seq = seq[1:]
                            
                                 
                              
                              resultList = []
                            
                              for i, num in enumerate(numList):
                                  #p=log10(num/denList[i])
                                  #if (p) >=2:
                                      #print "#########",abs(int(p))
                                  if (log10(num/denList[i]))>=2:
                                      #print "i am here"
                          	    resultList.append(int(abs(1)))
                              #print resultList
                              #for i in resultList:
                          	#mean=sum(resultList)/len(resultList)
                                  #sub=mean-i
                                  #queue = []
                                  #queue = (sub)**2
                                  #print sqrt(queue/len(resultList))
                          	
                              #print mean,"@@@@@@@@@@"
                          	
                                  
                             
                                 
                            
                              outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                              #print "this is line 294"
                              
                            
                              return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                              
                            
                          if __name__ == '__main__':
                            
                          
                              fnArray ='C:\python25\half.txt'
                              fnSeq = 'C:\python25\New Folder'
                              import os
                              dir_name='New Folder'
                              fList=os.listdir(dir_name)
                              fList1=[os.path.join(dir_name,f) for f in fList if
                              os.path.isfile(os.path.join(dir_name,f))]
                              seqSetIndex=0
                              fnSeq=fList1[seqSetIndex]
                              while True:
                                  seqSetIndex+=1
                              
                              
                            
                              outputfile =  "sequence_calc_data.txt"
                            
                                   
                            
                              arraySet = 1
                            
                              outList = []
                            
                              calcdata = 1
                            
                              while not calcdata is None:
                            
                                  seqSet = 1
                            
                                  while True:
                            
                                      calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                                      print calcdata
                            
                                      if calcdata:
                            
                                          outList.append(calcdata)
                            
                                          seqSet += 1
                            
                                      else:
                            
                                          break
                            
                                  arraySet += 1
                          
                                  
                            
                                 
                            
                              f = open(outputfile, 'w')
                            
                              f.write('\n'.join(outList))
                            
                              f.close()
                              #f=open(outputfile,"r")
                              #file_con=f.readlines()
                              #for line in file_con:
                               #   print line

                          Comment

                          • aboxylica
                            New Member
                            • Jul 2007
                            • 111

                            #73
                            when i run the code below.. its going to an infinite loop which says "reached the end of file"
                            Code:
                            from math import *
                            
                            def parseArray(fn, dataset=1, key='PO', term='/'):
                            
                                '''
                            
                                Read a formatted data file in matrix format and
                            
                                compile data into a dictionary
                            
                                '''
                            
                                f = open(fn)
                            
                             
                            
                                # skip to required data set
                            
                                for _ in range(dataset):
                                
                            
                                    try:
                            
                                        line = f.next()
                            
                                        while not line.startswith(key):
                            
                                            line = f.next()
                            
                                    except StopIteration, e:
                            
                                        print 'We have reached the end of the file!'
                            
                                        f.close()
                            
                                        return False
                            
                             
                            
                                headerList = line.strip().split()[1:]
                                
                            
                                lineList = []
                            
                             
                            
                                line = f.next().strip()
                            
                                while not line.startswith(term):
                            
                                    if line != '':
                            
                                        lineList.append(line.strip().split())
                            
                            
                                    line = f.next().strip()
                            
                             
                            
                                f.close()
                            
                             
                            
                                # Key list
                            
                                keys = [i[0] for i in lineList]
                            
                                # Values list
                            
                                values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                            
                             
                            
                                # Create a dictionary from keys and values
                            
                                lineDict = dict(zip(keys, values))
                            
                             
                            
                                dataDict = {}
                            
                             
                            
                                for i, item in enumerate(headerList):
                            
                                    dataDict[item] = {}
                            
                                    for key in lineDict:
                            
                                        dataDict[item][key] = lineDict[key][i]
                            
                             
                            
                                # Add 1.0 to every element in dataDict subdictionaries
                            
                                for keyMain in dataDict:
                            
                                    for keySub in dataDict[keyMain]:
                            
                                        dataDict[keyMain][keySub] += 1.0
                            
                             
                            
                                # Normalize original data (with 1 added) and update data
                            
                                valueSums = [sum(item)+4 for item in values]
                            
                                # print valueSums
                            
                             
                            
                                for keyMain in dataDict:
                            
                                    for keySub in dataDict[keyMain]:
                                        dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                            
                                return dataDict
                            
                             
                            
                             
                            
                            def parseData(fn, dataset=1, key='>'):
                            
                                '''
                            
                                Read a formatted data file of sequences
                            
                                Return a list of sequences
                            
                                The first element in the list is the header
                            
                                '''   
                            
                                # initialize output list
                            
                                dataList = []
                            
                               
                            
                                # open file for reading
                            
                                f = open(fn)
                            
                               
                            
                                # skip to required data set
                            
                                for _ in range(dataset):
                            
                            
                                    try:
                            
                                        s = f.next()
                            
                                        while not s.startswith(key):
                                        
                            
                                            s = f.next()
                            
                                    except StopIteration, e:
                            
                                        print 'We have reached the end of the file!'
                            
                                        f.close()
                            
                                        return False
                            
                             
                            
                                # initialize output list
                            
                                dataList = [s,]
                            
                                   
                                for line in f:
                            
                                    if not line.startswith(key):
                            
                                        dataList.append(line.strip())
                            
                                    else:
                            
                                        break
                            
                             
                            
                                f.close()
                            
                                return dataList
                            
                            
                             
                            
                               
                            def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                               
                                # sequence factor dictionary
                               
                                value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                               
                                     
                               
                                dataArray = parseArray(fnArray, arraySet)
                            
                               
                                if dataArray:
                            
                                    dataSeq = parseData(fnSeq, seqSet)
                            
                               
                                    if not dataSeq:
                               
                                        return False
                              
                                else:
                              
                                    return None
                                
                              
                                     
                              
                                # This is the complete sequence 
                              
                                seq = ''.join(dataSeq[1:])
                            
                                
                                
                            
                            
                                # These are the subkeys of dataArray - '01', '02', '03',.............
                              
                                subKeys = dataArray['A'].keys()
                            
                                subKeys.sort()
                            
                                
                              
                                   
                              
                                # Calculate num/den for each slice of sequence
                              
                                # Each sequence slice length = length of subKeys
                              
                                # Example:
                                # seq = 'ATCGATA'
                              
                                # subKeys length = 3
                              
                                # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                            
                                numList = []
                              
                                denList = []
                              
                                seqList = []
                              
                                for i in xrange(len(seq) - len(subKeys)):
                              
                                    subseq = seq[0:len(subKeys)]
                              
                                    seqList.append(subseq)
                            
                              
                                    num, den = 1, 1
                              
                                    for j, s in enumerate(subseq):
                              
                                        num *= dataArray[s][subKeys[j]]
                              
                                        den *= value[s]
                              
                                    numList.append(num)
                              
                                    denList.append(den)
                              
                                    seq = seq[1:]
                              
                                   
                                
                                resultList = []
                              
                                for i, num in enumerate(numList):
                                    #p=log10(num/denList[i])
                                    #if (p) >=2:
                                        #print "#########",abs(int(p))
                                    if (log10(num/denList[i]))>=2:
                                        #print "i am here"
                            	    resultList.append(int(abs(1)))
                                #print resultList
                                #for i in resultList:
                            	#mean=sum(resultList)/len(resultList)
                                    #sub=mean-i
                                    #queue = []
                                    #queue = (sub)**2
                                    #print sqrt(queue/len(resultList))
                            	
                                #print mean,"@@@@@@@@@@"
                            	
                                    
                               
                                   
                              
                                outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                                #print "this is line 294"
                                
                              
                                return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                                
                              
                            if __name__ == '__main__':
                              
                            
                                fnArray ='C:\python25\half.txt'
                                fnSeq = 'C:\python25\New Folder'
                                import os
                                dir_name='New Folder'
                                fList=os.listdir(dir_name)
                                fList1=[os.path.join(dir_name,f) for f in fList if
                                os.path.isfile(os.path.join(dir_name,f))]
                                seqSetIndex=0
                                fnSeq=fList1[seqSetIndex]
                                while True:
                                    
                                
                                
                              
                                    outputfile =  "sequence_calc_data.txt"
                              
                                     
                              
                                    arraySet = 1
                              
                                    outList = []
                              
                                    calcdata = 1
                              
                                    while not calcdata is None:
                              
                                        seqSet = 1
                              
                                        while True:
                              
                                            calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                                            print calcdata
                              
                                            if calcdata:
                              
                                                outList.append(calcdata)
                              
                                                seqSet += 1
                              
                                            else:
                              
                                                break
                              
                                    arraySet += 1
                                    seqSetIndex+=1
                            
                                    
                              
                                   
                              
                                f = open(outputfile, 'w')
                              
                                f.write('\n'.join(outList))
                              
                                f.close()
                                #f=open(outputfile,"r")
                                #file_con=f.readlines()
                                #for line in file_con:
                                 #   print line

                            Comment

                            • bvdet
                              Recognized Expert Specialist
                              • Oct 2006
                              • 2851

                              #74
                              You have two while loops. When calcData() returns None (when the end of file is reached), you issue one break statement. You may need another to get out of the other loop.

                              Comment

                              • aboxylica
                                New Member
                                • Jul 2007
                                • 111

                                #75
                                this is my code now.. i have added a break statement and still there seems to be some prob. it is coming out of the loop and saying
                                we have reached the end of file
                                None
                                here is the code
                                Code:
                                from math import *
                                def parseArray(fn, dataset=1, key='PO', term='/'):
                                
                                    '''
                                
                                    Read a formatted data file in matrix format and
                                
                                    compile data into a dictionary
                                
                                    '''
                                
                                    f = open(fn)
                                
                                 
                                
                                    # skip to required data set
                                
                                    for _ in range(dataset):
                                    
                                
                                        try:
                                
                                            line = f.next()
                                
                                            while not line.startswith(key):
                                
                                                line = f.next()
                                
                                        except StopIteration, e:
                                
                                            print 'We have reached the end of the file!'
                                
                                            f.close()
                                
                                            return False
                                
                                 
                                
                                    headerList = line.strip().split()[1:]
                                    
                                
                                    lineList = []
                                
                                 
                                
                                    line = f.next().strip()
                                
                                    while not line.startswith(term):
                                
                                        if line != '':
                                
                                            lineList.append(line.strip().split())
                                
                                
                                        line = f.next().strip()
                                
                                 
                                
                                    f.close()
                                
                                 
                                
                                    # Key list
                                
                                    keys = [i[0] for i in lineList]
                                
                                    # Values list
                                
                                    values = [[float(s) for s in item] for item in [j[1:] for j in lineList]]
                                
                                 
                                
                                    # Create a dictionary from keys and values
                                
                                    lineDict = dict(zip(keys, values))
                                
                                 
                                
                                    dataDict = {}
                                
                                 
                                
                                    for i, item in enumerate(headerList):
                                
                                        dataDict[item] = {}
                                
                                        for key in lineDict:
                                
                                            dataDict[item][key] = lineDict[key][i]
                                
                                 
                                
                                    # Add 1.0 to every element in dataDict subdictionaries
                                
                                    for keyMain in dataDict:
                                
                                        for keySub in dataDict[keyMain]:
                                
                                            dataDict[keyMain][keySub] += 1.0
                                
                                 
                                
                                    # Normalize original data (with 1 added) and update data
                                
                                    valueSums = [sum(item)+4 for item in values]
                                
                                    # print valueSums
                                
                                 
                                
                                    for keyMain in dataDict:
                                
                                        for keySub in dataDict[keyMain]:
                                            dataDict[keyMain][keySub] /= valueSums[int(keySub)-1]
                                
                                    return dataDict
                                
                                 
                                
                                 
                                
                                def parseData(fnSeq, dataset=1, key='>'):
                                
                                    '''
                                
                                    Read a formatted data file of sequences
                                
                                    Return a list of sequences
                                
                                    The first element in the list is the header
                                
                                    '''   
                                
                                    # initialize output list
                                
                                    dataList = []
                                
                                   
                                
                                    # open file for reading
                                
                                    f = open(fn)
                                
                                   
                                
                                    # skip to required data set
                                
                                    for _ in range(dataset):
                                
                                
                                        try:
                                
                                            s = f.next()
                                
                                            while not s.startswith(key):
                                            
                                
                                                s = f.next()
                                
                                        except StopIteration, e:
                                
                                            print 'We have reached the end of the file!'
                                
                                            f.close()
                                
                                            return False
                                
                                 
                                
                                    # initialize output list
                                
                                    dataList = [s,]
                                
                                       
                                    for line in f:
                                
                                        if not line.startswith(key):
                                
                                            dataList.append(line.strip())
                                
                                        else:
                                
                                            break
                                
                                 
                                
                                    f.close()
                                
                                    return dataList
                                
                                
                                 
                                
                                   
                                def compileData(fnArray, fnSeq, arraySet=1, seqSet=1):
                                   
                                    # sequence factor dictionary
                                   
                                    value={"A":0.3,"T":0.3,"C":0.2,"G":0.2}
                                   
                                         
                                   
                                    dataArray = parseArray(fnArray, arraySet)
                                
                                   
                                    if dataArray:
                                
                                        dataSeq = parseData(fnSeq, seqSet)
                                
                                   
                                        if not dataSeq:
                                   
                                            return False
                                  
                                    else:
                                  
                                        return None
                                    
                                  
                                         
                                  
                                    # This is the complete sequence 
                                  
                                    seq = ''.join(dataSeq[1:])
                                
                                    
                                    
                                
                                
                                    # These are the subkeys of dataArray - '01', '02', '03',.............
                                  
                                    subKeys = dataArray['A'].keys()
                                
                                    subKeys.sort()
                                
                                    
                                  
                                       
                                  
                                    # Calculate num/den for each slice of sequence
                                  
                                    # Each sequence slice length = length of subKeys
                                  
                                    # Example:
                                    # seq = 'ATCGATA'
                                  
                                    # subKeys length = 3
                                  
                                    # 'ATC', 'TCG', 'CGA', 'GAT', 'ATA'
                                
                                    numList = []
                                  
                                    denList = []
                                  
                                    seqList = []
                                  
                                    for i in xrange(len(seq) - len(subKeys)):
                                  
                                        subseq = seq[0:len(subKeys)]
                                  
                                        seqList.append(subseq)
                                
                                  
                                        num, den = 1, 1
                                  
                                        for j, s in enumerate(subseq):
                                  
                                            num *= dataArray[s][subKeys[j]]
                                  
                                            den *= value[s]
                                  
                                        numList.append(num)
                                  
                                        denList.append(den)
                                  
                                        seq = seq[1:]
                                  
                                       
                                    
                                    resultList = []
                                  
                                    for i, num in enumerate(numList):
                                        #p=log10(num/denList[i])
                                        #if (p) >=2:
                                            #print "#########",abs(int(p))
                                        #if (log10(num/denList[i]))>=2:
                                            #print "i am here"
                                	    resultList.append(int(abs(1)))
                                    #print resultList
                                    #for i in resultList:
                                	#mean=sum(resultList)/len(resultList)
                                        #sub=mean-i
                                        #queue = []
                                        #queue = (sub)**2
                                        #print sqrt(queue/len(resultList))
                                	
                                    #print mean,"@@@@@@@@@@"
                                	
                                        
                                   
                                       
                                  
                                    outStr = '\n'.join(['Sequence = %s Calculation = %d' % (seqList[i], res) for i, res in enumerate(resultList)])
                                    #print "this is line 294"
                                    
                                  
                                    return 'Array set # = %d\nSequence set # = %d\nSequence Header: %s\n%s' % (arraySet, seqSet, dataSeq[0], outStr)
                                    
                                  
                                if __name__ == '__main__':
                                  
                                
                                    fnArray ='C:\python25\half.txt'
                                    fnSeq = 'C:\python25\New Folder'
                                    import os
                                    dir_name='New Folder'
                                    fList=os.listdir(dir_name)
                                    fList1=[os.path.join(dir_name,f) for f in fList if os.path.isfile(os.path.join(dir_name,f))]
                                    seqSetIndex=0
                                    fnSeq=fList1[seqSetIndex]
                                    while True:
                                        
                                    
                                    
                                  
                                        outputfile =  "sequence_calc_data.txt"
                                  
                                         
                                  
                                        arraySet = 1
                                  
                                        outList = []
                                  
                                        calcdata = 1
                                  
                                        while not calcdata is None:
                                  
                                            seqSet = 1
                                  
                                            while True:
                                  
                                                calcdata = compileData(fnArray, fnSeq, arraySet, seqSet)
                                                print calcdata
                                  
                                                if calcdata:
                                  
                                                    outList.append(calcdata)
                                  
                                                    seqSet += 1
                                  
                                                else:
                                  
                                                    break
                                  
                                            arraySet += 1
                                            seqSetIndex+=1
                                        else:
                                            break
                                    
                                
                                        
                                  
                                       
                                  
                                    f = open(outputfile, 'w')
                                  
                                    f.write('\n'.join(outList))
                                  
                                    f.close()
                                    #f=open(outputfile,"r")
                                    #file_con=f.readlines()
                                    #for line in file_con:
                                     #   print line

                                Comment

                                Working...