Browse Source

Try both utf-8 and windows-1252 for decoding email

Recent submissions from Cirrus were classified as spam by the lore
analysis robot script.  This is because Cirrus used windows-1252 for
the encoding, which failed to decode as utf-8.

Try both encodings when decoding email.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
main
Mario Limonciello 1 year ago
parent
commit
8228c2222f
  1. 31
      contrib/process_linux_firmware.py

31
contrib/process_linux_firmware.py

@ -34,6 +34,8 @@ content_types = { @@ -34,6 +34,8 @@ content_types = {
def classify_content(content):
# load content into the email library
msg = email.message_from_string(content)
decoded = None
body = None

# check the subject
subject = msg["Subject"]
@ -42,17 +44,28 @@ def classify_content(content): @@ -42,17 +44,28 @@ def classify_content(content):
if "PATCH" in subject:
return ContentType.PATCH

for part in msg.walk():
if part.get_content_type() == "text/plain":
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
body = part.get_payload(decode=True)
else:
body = msg.get_payload(decode=True)

if body:
for encoding in ["utf-8", "windows-1252"]:
try:
body = part.get_payload(decode=True).decode("utf-8")
for key in content_types.keys():
if key in body:
return content_types[key]
break
except UnicodeDecodeError as e:
logging.warning("Failed to decode email: %s, treating as SPAM" % e)
decoded = body.decode(encoding)
break
except UnicodeDecodeError:
pass

if decoded:
for key in content_types.keys():
if key in decoded:
return content_types[key]
else:
logging.warning("Failed to decode email: %s, treating as SPAM", body)

return ContentType.SPAM



Loading…
Cancel
Save