append docname & linenumber to dic-value

Collapse
X
 
  • Time
  • Show
Clear All
new posts
  • simic
    New Member
    • Dec 2007
    • 1

    append docname & linenumber to dic-value

    Hey guys,
    I am comparing two documents - if a word is in both documents, it gets added as a new key to a dictionary.
    As the dictionary value I would like to store the documents name and the line# the word was found on.
    Here is what I have so far with comments:

    Code:
    # Shared result mapping: matched word -> list of occurrences.
    # matchtermer() fills this in place and also returns it.
    dic = {}


    def matchtermer():
        """Record every corpus word that also appears in the reference word list.

        Reads 'korpus/avis.txt' (the corpus) and 'ordliste_output_kort.txt'
        (the reference word list). Each whitespace-separated corpus word has
        one kind of trailing punctuation mark removed and is lowercased; if the
        result occurs in the reference list it becomes a key of the module-level
        ``dic`` with an empty list as its value.

        Returns:
            The module-level ``dic`` (the same object, mutated in place).

        Raises:
            OSError: if either input file cannot be opened.
        """
        # Context managers close both files even if reading fails.
        with open('korpus/avis.txt') as corpus_file:
            corpus_words = corpus_file.read().split()
        with open("ordliste_output_kort.txt") as reference_file:
            # A set makes each membership test O(1) instead of scanning a list.
            reference_words = set(reference_file.read().split())

        trailing_marks = (",", ".", "!", "?", ";")
        for raw_word in corpus_words:
            # Slicing (rather than indexing) is safe even for an empty string.
            last_char = raw_word[-1:]
            if last_char in trailing_marks:
                # rstrip removes every trailing repeat of that one character.
                cleaned = raw_word.rstrip(last_char)
            else:
                cleaned = raw_word
            cleaned = cleaned.lower()

            # Reference words are compared as written (they are not lowercased).
            if cleaned in reference_words:
                # Create the entry on first sight; keep any existing list.
                dic.setdefault(cleaned, [])
                # TODO: record where the word was found. list.append() takes a
                # single argument, so the pair must be passed as one tuple:
                #     dic[cleaned].append((docname, linenumber))
                # That also requires iterating the corpus line by line.
        return dic
  • bvdet
    Recognized Expert Specialist
    • Oct 2006
    • 2851

    #2
    I think this will do it:[code=Python]import string, re

    def wordList(words):
        """Split *words* into a list of cleaned, lowercased words.

        Whitespace-separated tokens that contain a digit are dropped. The
        remaining tokens have leading/trailing whitespace and punctuation
        (``string.punctuation``) removed and are lowercased; tokens that end
        up empty are discarded.

        Args:
            words: the text to split.

        Returns:
            The cleaned words, in their original order (duplicates kept).
        """
        digit_pattern = re.compile(r'\d+')
        cleaned_words = []
        for token in words.split():
            # eliminate words with digits
            if digit_pattern.search(token):
                continue
            # strip punctuation and whitespace, lowercase
            cleaned = token.strip().strip(string.punctuation).lower()
            # eliminate blank words (tokens made only of punctuation)
            if cleaned != '':
                cleaned_words.append(cleaned)
        return cleaned_words

    def matchtermer(fn1, fn2):
        dd = {}
        # file to compare against
        f1 = open(fn1).read()
        # file to compare
        f2 = open(fn2).readlines()
        word_list = wordList(f1)
        for i, line in enumerate(f2):
            for word in line.split():
                word = word.strip().strip(string.punctuation).lower()
                if word in word_list:
                    dd.setdefault(word, []).append((fn2, i+1))
        return dd[/code]Usage:[code=Python]wordDict = matchtermer('words1.txt', 'words2.txt')[/code]

    Comment

    Working...