From 8228c2222fcf5791fe5643252e4d248839c199e9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 16 Nov 2023 10:42:10 -0600 Subject: [PATCH] Try both utf-8 and windows-1252 for decoding email Recent submissions from Cirrus were classified as spam by the lore analysis robot script. This is because Cirrus used windows-1252 for the encoding, which failed to decode as utf-8. Try both encodings when decoding email. Signed-off-by: Mario Limonciello --- contrib/process_linux_firmware.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/contrib/process_linux_firmware.py b/contrib/process_linux_firmware.py index 668e35c0..ea108391 100755 --- a/contrib/process_linux_firmware.py +++ b/contrib/process_linux_firmware.py @@ -34,6 +34,8 @@ content_types = { def classify_content(content): # load content into the email library msg = email.message_from_string(content) + decoded = None + body = None # check the subject subject = msg["Subject"] @@ -42,17 +44,28 @@ def classify_content(content): if "PATCH" in subject: return ContentType.PATCH - for part in msg.walk(): - if part.get_content_type() == "text/plain": + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + body = part.get_payload(decode=True) + else: + body = msg.get_payload(decode=True) + + if body: + for encoding in ["utf-8", "windows-1252"]: try: - body = part.get_payload(decode=True).decode("utf-8") - for key in content_types.keys(): - if key in body: - return content_types[key] - break - except UnicodeDecodeError as e: - logging.warning("Failed to decode email: %s, treating as SPAM" % e) + decoded = body.decode(encoding) break + except UnicodeDecodeError: + pass + + if decoded: + for key in content_types.keys(): + if key in decoded: + return content_types[key] + else: + logging.warning("Failed to decode email: %s, treating as SPAM", body) + return ContentType.SPAM