Re: parsing incoming emails

Collapse
This topic is closed.
X
X
 
  • Time
  • Show
Clear All
new posts
  • Michiel Overtoom

    Re: parsing incoming emails

    Ahmed wrote...
    I am working on a project where I need to parse incoming emails
    (Microsoft outlook)
    I'm not sure if you are able to bypass Outlook (and have Python fetch the
    mail itself using poplib), but if you are, the following code might be
    useful. I use this to pry apart emails which might contain multiple MIME parts.

    import logging
    import poplib
    import smtplib
    from email.Parser import Parser
    from rfc822 import parseaddr

    # POP3 account settings (placeholders -- replace with real values and keep
    # the real password out of version control).
    popserver = "pop.site.com"
    popuser = "user@site.com"
    poppassword = "secret"

    # split a message into a header part and a body part
    def separate(msg):
        """Split a message into its header lines and its body lines.

        msg is either the whole message as a single string (it is split on
        newline characters here) or an already-split sequence of lines.  The
        first empty line marks the end of the headers and belongs to neither
        part.

        Returns a (headers, body) pair of line sequences.  When there is no
        empty line at all, every line is treated as a header and the body is
        empty.
        """
        if isinstance(msg, str):
            msg = msg.split('\n')
        try:
            blank = msg.index('')
        except ValueError:
            # No separator line: the message consists of headers only.
            blank = len(msg)
        return msg[:blank], msg[blank + 1:]


    # return a certain headerline from the headers
    def headerline(header, tag="From: "):
        """Return the value of the first header line whose field name is tag.

        header is a sequence of header lines.  tag is the field name including
        its colon; trailing whitespace in tag is ignored and the comparison is
        case-insensitive, as mail header field names are.

        The value is returned with surrounding whitespace removed.  An empty
        string is returned when no line matches.
        """
        prefix = tag.rstrip().lower()
        for line in header:
            if line[:len(prefix)].lower() == prefix:
                # Slice exactly at the end of the tag and strip, instead of
                # assuming that precisely one separator character follows it.
                return line[len(prefix):].strip()
        return ""


    # enumerate recursively the contents of a MIME message
    # remember the first text/plain and the first text/html part that are found
    # also remember if any other parts were found (like attachments)
    #
    def enummimeparts(msg, extract, level=1, verbose=False):
        """Walk a MIME message recursively and record its text parts in extract.

        msg is the raw message text; an already-parsed message object (anything
        with an is_multipart method) is accepted too, which is how the
        recursion hands sub-parts down without serialising them again.

        extract is a dict that is updated in place:
          extract["text/plain"]  payload of the first text/plain part found
          extract["text/html"]   payload of the first text/html part found
          extract["others"]      set to True when any further part is found
                                 (attachments, or additional text parts)

        level is the nesting depth, used only to indent the verbose output.
        When verbose is true, the content type of every part is printed.

        NOTE: payloads are stored as they appear in the message; a
        base64 or quoted-printable transfer encoding is not undone here.
        """
        m = msg if hasattr(msg, "is_multipart") else Parser().parsestr(msg)

        if m.is_multipart():
            if verbose:
                # Single-argument print() behaves the same as a statement and
                # as a function.
                print('\t' * level + ' multipart')
            for part in m.get_payload():
                enummimeparts(part, extract, level + 1, verbose)
            return

        t = m.get_content_type()
        if verbose:
            print('\t' * level + ' ' + t)
        if t in ("text/plain", "text/html") and t not in extract:
            extract[t] = m.get_payload()
        else:
            # Either a non-text part or a second text part of a kind that was
            # already captured.
            extract["others"] = True


    # extract the first 'text/plain' and 'text/html' mime-parts from a message
    def extracttext(msg):
        """Return the first plain-text and HTML parts of a raw message.

        Returns a (plain, html, others) tuple: plain and html are the payloads
        recorded by enummimeparts for the first text/plain and text/html part
        (None when the message has no such part); others is True when
        enummimeparts flagged any further part, otherwise False.
        """
        extract = {}
        enummimeparts(msg, extract)
        # The whole tuple must sit on the return statement itself; a bare
        # "return" followed by the expression on the next line returns None.
        return (
            extract.get("text/plain", None),
            extract.get("text/html", None),
            extract.get("others", False),
        )


    def processmessage(msgnr):
        """Fetch message number msgnr, process its text, then delete it.

        Uses the module-level POP3 connection `pop`.  The subject and sender
        address are logged, the plain-text part (or, failing that, the HTML
        part, or "" when there is neither) is handed to processtext(), and the
        message is then marked for deletion on the server.
        """
        # NOTE(review): this assumes pop.retr() yields str lines, as on
        # Python 2; on Python 3 poplib returns bytes lines -- confirm the
        # target interpreter.
        response, lines, octets = pop.retr(msgnr)
        msg = '\n'.join(lines)

        headers, _ = separate(lines)
        _, fromaddress = parseaddr(headerline(headers, "From:"))
        subject = headerline(headers, "Subject:")
        logging.info("%s (%s)", subject, fromaddress)

        plain, html, _ = extracttext(msg)
        # Prefer flat text; if not present in the message, fall back to the
        # HTML content (if any).
        texttoprocess = plain or html or ""

        # Now do something useful with the text.
        processtext(texttoprocess)

        # Delete the message from the POP server only after processing
        # succeeded.
        pop.dele(msgnr)


    # connect to the pop server and process all messages
    logging.info("Checking pop server '%s', user '%s'", popserver, popuser)
    pop = poplib.POP3(popserver)
    try:
        pop.user(popuser)
        pop.pass_(poppassword)
        stat = pop.stat()
        # stat[0] is the number of waiting messages; POP3 numbers them from 1.
        for n in range(1, stat[0] + 1):
            processmessage(n)
    finally:
        # Always end the session, even when processing a message failed, so
        # the connection is not left open.
        pop.quit()


    --
    "The ability of the OSS process to collect and harness
    the collective IQ of thousands of individuals across
    the Internet is simply amazing." - Vinod Vallopillil


Working...