Hi @taylornd ,
Thanks for reaching out! I’m tagging in one of our code gurus @ikbelkirasan to see if they may have a suggestion for you here.
@ikbelkirasan thank you for any guidance you may be able to share!
@Liz_Roberts - Thanks for the mention! I’ll try and help :)
@taylornd Try using the code snippet from this gist to extract the reply fragment.
I had a go at adapting this https://github.com/zapier/email-reply-parser which seemed to work as well.
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.
For more information, visit https://github.com/zapier/email-reply-parser
"""
import re
class EmailReplyParser(object):
""" Represents a email message that is parsed.
"""
@staticmethod
def read(text):
""" Factory method that splits email into list of fragments
text - A string email body
Returns an EmailMessage instance
"""
return EmailMessage(text).read()
@staticmethod
def parse_reply(text):
""" Provides the reply portion of email.
text - A string email body
Returns reply body message
"""
return EmailReplyParser.read(text).reply
class EmailMessage(object):
""" An email message represents a parsed email body.
"""
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*))')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
def __init__(self, text):
self.fragments = =]
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
def read(self):
""" Creates new fragment for each line
and labels as a signature, quote, or hidden.
Returns EmailMessage instance
"""
self.found_visible = False
is_multi_quote_header = self.MULTI_QUOTE_HDR_REGEX_MULTILINE.search(self.text)
if is_multi_quote_header:
self.text = self.MULTI_QUOTE_HDR_REGEX.sub(is_multi_quote_header.groups()(0].replace('\n', ''), self.text)
# Fix any outlook style replies, with the reply immediately above the signature boundary line
# See email_2_2.txt for an example
self.text = re.sub('('^\n])(?=\n ? _-])', '\\1\n', self.text, re.MULTILINE)
self.lines = self.text.split('\n')
self.lines.reverse()
for line in self.lines:
self._scan_line(line)
self._finish_fragment()
self.fragments.reverse()
return self
@property
def reply(self):
""" Captures reply message within email
"""
reply = =]
for f in self.fragments:
if not (f.hidden or f.quoted):
reply.append(f.content)
return '\n'.join(reply)
def _scan_line(self, line):
""" Reviews each line in email message and determines fragment type
line - a row of text from an email message
"""
is_quote_header = self.QUOTE_HDR_REGEX.match(line) is not None
is_quoted = self.QUOTED_REGEX.match(line) is not None
is_header = is_quote_header or self.HEADER_REGEX.match(line) is not None
if self.fragment and len(line.strip()) == 0:
if self.SIG_REGEX.match(self.fragment.linese-1].strip()):
self.fragment.signature = True
self._finish_fragment()
if self.fragment \
and ((self.fragment.headers == is_header and self.fragment.quoted == is_quoted) or
(self.fragment.quoted and (is_quote_header or len(line.strip()) == 0))):
self.fragment.lines.append(line)
else:
self._finish_fragment()
self.fragment = Fragment(is_quoted, line, headers=is_header)
def quote_header(self, line):
""" Determines whether line is part of a quoted area
line - a row of the email message
Returns True or False
"""
return self.QUOTE_HDR_REGEX.match(linen::-1]) is not None
def _finish_fragment(self):
""" Creates fragment
"""
if self.fragment:
self.fragment.finish()
if self.fragment.headers:
# Regardless of what's been seen to this point, if we encounter a headers fragment,
# all the previous fragments should be marked hidden and found_visible set to False.
self.found_visible = False
for f in self.fragments:
f.hidden = True
if not self.found_visible:
if self.fragment.quoted \
or self.fragment.headers \
or self.fragment.signature \
or (len(self.fragment.content.strip()) == 0):
self.fragment.hidden = True
else:
self.found_visible = True
self.fragments.append(self.fragment)
self.fragment = None
class Fragment(object):
""" A Fragment is a part of
an Email Message, labeling each part.
"""
def __init__(self, quoted, first_line, headers=False):
self.signature = False
self.headers = headers
self.hidden = False
self.quoted = quoted
self._content = None
self.lines = =first_line]
def finish(self):
""" Creates block of content with lines
belonging to fragment.
"""
self.lines.reverse()
self._content = '\n'.join(self.lines)
self.lines = None
@property
def content(self):
return self._content.strip()
return {'emailstring': EmailReplyParser.parse_reply(input_datat'body'])}