email parsing

**ra9ftm** · Aug 27 '08, 04:45 PM

This is my first Python script. I don't know whether it uses the
modules correctly, but it works fine, especially with Russian code
pages and MIME-formatted messages, including quoted-printable and
base64-encoded ones.

It would be great if anybody could post comments on this script. Is
it good or bad?

import email
import mailbox
from email.Header import decode_header
from email.Header import make_header
import string
import sys

# Encoding used for everything the script prints (cp866 is the DOS Cyrillic
# code page).
outEnc = "cp866"

# The mbox file to convert is the first command-line argument.  Fail with a
# usage message instead of a bare IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s MBOX_FILE" % sys.argv[0])
infile = sys.argv[1]

# Footer markers: everything from the last occurrence of a marker to the end
# of a message body is cut off by obrez().
#
# Order matters: obrez() stops at the first marker that is present, and the
# short tilde run at the end is a substring of the longer ones, so the more
# specific markers must come first.
#
# NOTE(review): the pasted listing had a space injected every 15 characters,
# which would prevent these markers from matching.  The run lengths below
# (90, 31 and 18 tildes) are reconstructed from that listing -- confirm them
# against a real message footer.
subStrObrez = [
    "~" * 90,
    "~" * 31 + "\nTo UNSUBSCRIBE from this forum, send an email to:",
    "~" * 18,
]

def obrez(strMsg, markers=None):
    """Cut the mailing-list footer off the end of a message body.

    The markers are tried in order.  For the first one that occurs in
    ``strMsg``, the text before its *last* occurrence is returned.  If no
    marker occurs, ``strMsg`` is returned unchanged.

    strMsg  -- message body (byte string or text).
    markers -- optional sequence of footer markers; defaults to the
               module-level ``subStrObrez`` list.
    """
    if markers is None:
        markers = subStrObrez
    for marker in markers:
        # rfind: cut at the last occurrence, so the same text appearing
        # earlier in the body (e.g. in a quoted reply) is kept.
        cut = strMsg.rfind(marker)
        if cut != -1:
            return strMsg[:cut]
    return strMsg

def my_get_header(str):
    """Decode a raw (possibly RFC 2047 encoded) header value to text.

    Every chunk returned by ``decode_header`` is converted to text and
    followed by a single space, so the result always ends with a space
    when the header is non-empty.

    str -- raw header value as returned by ``msg['Name']``; ``None`` (header
           absent) yields an empty string.  The parameter name shadows the
           builtin but is kept so existing callers are unaffected.
    """
    if str is None:
        return u""
    pieces = []
    for val, encoding in decode_header(str):
        if isinstance(val, bytes):
            try:
                # Chunks without a declared charset are treated as ASCII.
                val = val.decode(encoding or "ascii", "replace")
            except LookupError:
                # Unknown charset label: keep what is readable instead of
                # aborting the whole run.
                val = val.decode("ascii", "replace")
        pieces.append(val + u" ")
    return u"".join(pieces)

def _render_part(part, strip_footer):
    """Return the transfer-decoded body of *part* as a printable byte string.

    If the part declares a charset, the body is decoded with it, optionally
    stripped of the list footer, and re-encoded to ``outEnc``.  Without a
    charset the raw bytes are passed through (and optionally stripped).
    """
    payload = part.get_payload(None, True)
    if payload is None:
        # Defensive: nothing to print if no decoded payload is available.
        return ""
    charset = part.get_content_charset()
    decoded = False
    if charset:
        try:
            payload = payload.decode(charset, "replace")
            decoded = True
        except LookupError:
            # Unknown charset label: fall back to printing the raw bytes.
            pass
    if strip_footer:
        payload = obrez(payload)
    if decoded:
        # "replace" so one character missing from the output code page does
        # not abort the whole conversion.
        payload = payload.encode(outEnc, "replace")
    return payload


def proc(msg):
    """Print the From/To/Subject headers and the text body of *msg*.

    For a multipart message every ``text/plain`` part is printed with the
    list footer removed.  For a single-part message a ``text/plain`` body is
    printed with the footer removed and a ``text/html`` body is printed
    as-is; any other content type prints no body.

    Written for Python 2 (output is byte strings in ``outEnc``).  Every
    ``print`` takes exactly one parenthesised expression so the source also
    parses on Python 3; ``print("")`` is used for blank lines because
    ``print()`` would print ``()`` on Python 2.
    """
    print('From : ' + my_get_header(msg['From']).encode(outEnc, "replace"))
    print('To : ' + my_get_header(msg['To']).encode(outEnc, "replace"))
    print('Subject: ' + my_get_header(msg['Subject']).encode(outEnc, "replace"))
    print("")

    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                print(_render_part(part, True))
        return

    content_type = msg.get_content_type()
    if content_type == "text/plain":
        print(_render_part(msg, True))
    elif content_type == "text/html":
        print(_render_part(msg, False))

###########################################################################
# The main program
#
# Walk every message of the mbox file and, for each rubric whose tag occurs
# in the decoded Subject, print a forum posting: an "SB <board>@FORUM < INET"
# line, the subject, the message itself (via proc) and a "/EX" terminator.
# NOTE(review): the SB ... /EX framing looks like a BBS/forum import format
# -- confirm with the consumer of this output.
# NOTE(review): indentation was lost in the pasted listing; all the prints
# are assumed to belong to the matching-rubric branch -- confirm.

# [subject tag, target board] pairs.
RubLst = [
    ["[contestru]", "FOTSTR"],
    ["[russiandx]", "FORUDX"],
]

# "with" guarantees the mbox file is closed even if a message fails to parse.
with open(infile, "rb") as f:
    for msg in mailbox.UnixMailbox(f, email.message_from_file):
        # Decode the subject once per message instead of once per rubric.
        subject = my_get_header(msg['Subject'])
        for rub in RubLst:
            if rub[0] in subject:
                print("SB " + rub[1] + "@FORUM < INET")
                print(subject.encode(outEnc, "replace"))
                # print("") rather than print(): the latter prints "()" on
                # Python 2.
                print("")
                proc(msg)
                print("")
                print("powered by Python")
                print("/EX")