Re: parsing incoming emails

**Michiel Overtoom** · Jul 10 '08, 08:15 PM

Ahmed wrote...

I am working on a project where I need to parse incoming emails
(Microsoft outlook)

I'm not sure if you are able to bypass Outlook (and have Python fetch the
mail itself using poplib), but if you are, the following code might be
useful. I use this to pry apart emails which might contain multiple MIME parts.

import logging
import poplib
import smtplib
from email.Parser import Parser
from rfc822 import parseaddr

# Connection settings for the POP3 mailbox that is polled below.
# These look like placeholders: substitute the real host and credentials,
# and avoid committing a real password to source control.
popserver = "pop.site.com"
popuser = "user@site.com"
poppassword = "secret"

def separate(msg):
    """Split a message into a header part and a body part.

    ``msg`` is either the whole message as a single string (lines joined
    with ``'\\n'``) or a sequence of lines that has already been split.
    The first empty line marks the end of the headers.

    Returns a ``(headers, body)`` tuple of two lists of lines. The empty
    separator line belongs to neither list. A message without any empty
    line is treated as headers only, with an empty body.
    """
    if isinstance(msg, str):
        msg = msg.split('\n')
    try:
        emptyline = msg.index('')
    except ValueError:
        # No blank separator line: everything is header, nothing is body.
        return list(msg), []
    return msg[:emptyline], msg[emptyline + 1:]

def headerline(header, tag="From: "):
    """Return the value of the first header line that starts with ``tag``.

    ``header`` is an iterable of header lines (for example the first list
    returned by ``separate``). ``tag`` is the field name including its
    colon; surrounding whitespace in ``tag`` is ignored and the comparison
    is case-insensitive, since header field names are not case-sensitive.

    Returns the text after the tag with surrounding whitespace removed, or
    an empty string when no line matches.
    """
    wanted = tag.strip().lower()
    for h in header:
        if h.lower().startswith(wanted):
            # Slice exactly past the tag and strip, instead of assuming
            # exactly one separator character follows it.
            return h[len(wanted):].strip()
    return ""

def _part_text(part):
    """Return the body of a leaf MIME part with its transfer encoding undone."""
    encoding = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if encoding not in ('base64', 'quoted-printable'):
        # 7bit / 8bit / binary bodies are already the literal text.
        return part.get_payload()
    data = part.get_payload(decode=True)
    if isinstance(data, str):
        # Python 2 hands back a byte string, which is the native str there.
        return data
    charset = part.get_content_charset() or 'utf-8'
    try:
        return data.decode(charset, 'replace')
    except LookupError:
        # Unknown charset label in the message; fall back to UTF-8.
        return data.decode('utf-8', 'replace')


def _collect_parts(part, extract, level, verbose):
    """Walk an already parsed message tree and fill ``extract``."""
    if part.is_multipart():
        if verbose:
            print('\t' * level + 'multipart')
        for child in part.get_payload():
            _collect_parts(child, extract, level + 1, verbose)
        return
    ctype = part.get_content_type()
    if verbose:
        print('\t' * level + ctype)
    if ctype in ("text/plain", "text/html") and ctype not in extract:
        # Only the first part of each text type is kept.
        extract[ctype] = _part_text(part)
    else:
        # Attachments, other content types, and any repeated text part.
        extract["others"] = True


def enummimeparts(msg, extract, level=1, verbose=False):
    """Recursively enumerate the contents of a MIME message.

    ``msg`` is the complete message as a string. ``extract`` is a dict that
    is filled in place:

    * ``extract["text/plain"]`` - body of the first text/plain part found
    * ``extract["text/html"]``  - body of the first text/html part found
    * ``extract["others"]``     - set to True when any other part is found
      (such as an attachment), including a second text/plain or text/html

    Bodies sent as base64 or quoted-printable are decoded before they are
    stored. With ``verbose`` set, the content type of every part is printed,
    indented by one tab per nesting ``level``.
    """
    # Parse once and walk the resulting tree, rather than serialising and
    # re-parsing every sub-part.
    _collect_parts(Parser().parsestr(msg), extract, level, verbose)

def extracttext(msg):
    """Extract the first 'text/plain' and 'text/html' MIME parts of a message.

    ``msg`` is the complete message as a string.

    Returns a ``(plain, html, others)`` tuple: ``plain`` and ``html`` are
    the bodies of the first part of each type, or None when the message has
    no such part; ``others`` is True when ``enummimeparts`` flagged any
    further part, otherwise False.
    """
    extract = {}
    enummimeparts(msg, extract)
    # The whole tuple must sit on the return statement itself; a bare
    # ``return`` followed by the expression would return None.
    return (
        extract.get("text/plain", None),
        extract.get("text/html", None),
        extract.get("others", False),
    )

def processmessage(msgnr):
    """Fetch one message from the POP server, process its text, delete it.

    ``msgnr`` is the message number passed to ``pop.retr`` and ``pop.dele``.
    Uses the module-level ``pop`` connection. The sender address and the
    subject are logged, then the text/plain body (or, if there is none, the
    text/html body) is handed to ``processtext``.

    NOTE(review): ``processtext`` is not defined in the visible code; it
    has to be supplied by whoever uses this script.
    """
    # Get a message from the POP server and extract the parts.
    _response, lines, _octets = pop.retr(msgnr)
    msg = '\n'.join(lines)
    headers, _body = separate(lines)
    _name, fromaddress = parseaddr(headerline(headers, "From:"))
    subject = headerline(headers, "Subject:")
    logging.info("%s (%s)", subject, fromaddress)
    plain, html, _others = extracttext(msg)
    # Prefer flat text; if not present in the message, fall back to HTML
    # content (if any).
    texttoprocess = plain or html or ""
    # Now do something useful with the text.
    processtext(texttoprocess)
    # Delete the message from the POP server after processing.
    pop.dele(msgnr)

# Connect to the POP server and process all messages.
logging.info("Checking pop server '%s', user '%s'", popserver, popuser)
pop = poplib.POP3(popserver)
try:
    pop.user(popuser)
    pop.pass_(poppassword)
    stat = pop.stat()
    # stat[0] is the message count; processmessage is called with 1..count.
    for n in range(stat[0]):
        processmessage(n + 1)
finally:
    # Always end the session, even if processing a message raised.
    pop.quit()

--
"The ability of the OSS process to collect and harness
the collective IQ of thousands of individuals across
the Internet is simply amazing." - Vinod Valloppillil

Halloween Document 4

http://www.catb.org/~esr/halloween/halloween4.html

On or about Jan 3 1999, my satire