From 4162cc100f19bdd66e6775d631d430092c492725 Mon Sep 17 00:00:00 2001 From: Jingchao Zhong Date: Tue, 16 May 2023 03:49:18 -0700 Subject: [PATCH 1/5] Add tests for new plugin: Email Address --- tests/plugins/email_address_test.py | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 tests/plugins/email_address_test.py diff --git a/tests/plugins/email_address_test.py b/tests/plugins/email_address_test.py new file mode 100644 index 000000000..cb3fccb66 --- /dev/null +++ b/tests/plugins/email_address_test.py @@ -0,0 +1,60 @@ +import pytest +from detect_secrets.plugins.email_address import EmailAddressDetector + + +class TestEmailAddressDetector: + """ + Testing strategy + + Cover the cartesian product of these partitions: + + 1. Partition on email address format: + a. Valid email addresses + b. Invalid email addresses + + 2. Partition on line content: + a. email address is the only content + b. email address is part of a larger string + + And cover these cases: + + 1. Partition on whitelist email addresses: + a. email address is in the whitelist + b. 
email address is not in the whitelist + """ + + @pytest.mark.parametrize( + 'payload, should_flag', + [ + # Valid email addresses, only content + ('user@example.com', True), + ('user.name@example.com', True), + ('user_name@example.com', True), + ('user-name@example.com', True), + ('user+name@example.com', True), + ('user@ex_ample.com', True), + ('user@-example.com', True), + ('user@example-.com', True), + ('user.name+category@example.com', True), + # Valid email addresses, part of larger string + ('This is an email address: user@example.com', True), + ('user@example.com is a valid email address', True), + # Invalid email addresses + ('user@com', False), + ('@example.com', False), + ('user@.com', False), + ('user@ex..com', False), + # Whitelist email addresses + ('noreply@github.com', False), + ('git@github.com', False), + # Non-whitelist email addresses + ('user@gmail.com', True), + ('user@yahoo.com', True), + ('user@hotmail.com', True), + ], + ) + def test_analyze_line(self, payload, should_flag): + logic = EmailAddressDetector() + + output = logic.analyze_line(filename='mock_filename', line=payload) + assert len(output) == int(should_flag) From d2ffc0ee9b83f0666fceb56d3895e8f5c111ac86 Mon Sep 17 00:00:00 2001 From: Jingchao Zhong Date: Tue, 16 May 2023 03:51:55 -0700 Subject: [PATCH 2/5] Add a plugin for Email Address passing the tests --- detect_secrets/plugins/email_address.py | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 detect_secrets/plugins/email_address.py diff --git a/detect_secrets/plugins/email_address.py b/detect_secrets/plugins/email_address.py new file mode 100644 index 000000000..92f2393d5 --- /dev/null +++ b/detect_secrets/plugins/email_address.py @@ -0,0 +1,52 @@ +import re +from .base import RegexBasedDetector + +class EmailAddressDetector(RegexBasedDetector): + """Email Address Detector. + + This class is designed to efficiently and accurately detect email addresses within given text.
It primarily + validates the general format of email addresses, and does not adhere strictly to email format standards such as RFC 5322. + + Key Features: + - Ignores common, non-security-threatening email addresses to enhance precision. + + Limitations: + - Despite robust detection mechanisms, the class is not infallible and may not cover all edge cases. + - It does not support some examples from RFC 6530, e.g., email addresses with Greek alphabets. + + References: + - https://en.wikipedia.org/wiki/Email_address + - https://stackoverflow.com/a/14321045 + """ + secret_type = 'Email Address' + + whitelist = ['noreply@github.com', 'git@github.com'] + # Excluses whitelist email addresses from detection to reduce false positives. + + base_pattern = r""" + [\w+-]+ # Local part before the @ symbol + (?:\.[\w+-]+)* # Optional dot-separated words in the local part + @ # The @ symbol + [\w+-]+ # Domain part after the @ symbol + (?:\.[\w+-]+)* # Optional dot-separated words in the domain part + (?:\.[a-zA-Z]{2,4}) # TLD part + """ + # Pattern Breakdown: + # 1. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - + # Represents the local part of the email address before the @ symbol. + # 2. (?:\.[\w+-]+)*: Matches zero or more of a-z, A-Z, _, +, -, but must start with a . (dot) + # Allows for dot-separated words in the local part of the email address. + # 3. @: Matches the @ symbol. + # 4. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - + # Represents the domain part of the email address after the @ symbol. + # 5. (?:\.[\w+-]+)*: Matches zero or more of a-z, A-Z, _, +, -, but must start with a . (dot) + # Allows for dot-separated words in the domain part of the email address. + # 6. (?:\.[a-zA-Z]{2,4}): Matches 2 to 4 instances of a-z, A-Z, starting with a . (dot) + # Represents the TLD (top-level domain) part of the email address. + + deny_pattern = r"(?!" 
+ "|".join(re.escape(email) for email in whitelist) + r"$)" + base_pattern + # Combines the base pattern with a negative lookahead to exclude whitelist email addresses. + + denylist = [ + re.compile(r"\b" + deny_pattern + r"\b", flags=re.VERBOSE) + ] From 61670412bd8baa8606034a82d8f8e821c6c4a186 Mon Sep 17 00:00:00 2001 From: Jingchao Zhong Date: Tue, 16 May 2023 04:11:05 -0700 Subject: [PATCH 3/5] Update documentation for new feature: EmailAddressDetector plugin --- CHANGELOG.md | 4 ++++ README.md | 1 + 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f2f363a0..137e13dac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,10 @@ If you love `detect-secrets`, please star our project on GitHub to show your sup ### v1.4.0 diff --git a/README.md b/README.md index 03e7e36fb..a37fb4f7d 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ AzureStorageKeyDetector BasicAuthDetector CloudantDetector DiscordBotTokenDetector +EmailAddressDetector GitHubTokenDetector Base64HighEntropyString HexHighEntropyString From b66379160475aa0feaef5f203397e09cb3dfe974 Mon Sep 17 00:00:00 2001 From: Jingchao Zhong <92573736+perryzjc@users.noreply.github.com> Date: Thu, 16 Nov 2023 21:09:37 -0800 Subject: [PATCH 4/5] Fix python code style --- detect_secrets/plugins/email_address.py | 37 +++++++++++++++---------- tests/plugins/email_address_test.py | 1 + 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/detect_secrets/plugins/email_address.py b/detect_secrets/plugins/email_address.py index 92f2393d5..e752546a3 100644 --- a/detect_secrets/plugins/email_address.py +++ b/detect_secrets/plugins/email_address.py @@ -1,27 +1,34 @@ import re + from .base import RegexBasedDetector -class EmailAddressDetector(RegexBasedDetector): - """Email Address Detector. - This class is designed to efficiently and accurately detect email addresses within given text.
It primarily - validates the general format of email addresses, and does not adhere strictly to email format standards such as RFC 5322. +class EmailAddressDetector(RegexBasedDetector): + """ + A detector for identifying email addresses within text. It uses regular expressions to + focus on general email structures, not strictly adhering to standards like RFC 5322. + Designed for efficient and broad detection, it also has some limitations. - Key Features: - - Ignores common, non-security-threatening email addresses to enhance precision. + Features: + - Detects a wide range of email formats efficiently. + - Ignores common, non-critical emails to minimize false positives. Limitations: - - Despite robust detection mechanisms, the class is not infallible and may not cover all edge cases. - - It does not support some examples from RFC 6530, e.g., email addresses with Greek alphabets. + - May miss edge cases or unconventional email formats. + - Not compliant with advanced formats, e.g., RFC 6530 non-Latin emails. - References: + Regular Expression: + Utilizes a regex pattern focusing on typical email components: local part, domain, TLD. + Excludes predefined whitelist emails to reduce false positives. + + References: - https://en.wikipedia.org/wiki/Email_address - https://stackoverflow.com/a/14321045 """ secret_type = 'Email Address' - whitelist = ['noreply@github.com', 'git@github.com'] # Excluses whitelist email addresses from detection to reduce false positives. + whitelist = ['noreply@github.com', 'git@github.com'] base_pattern = r""" [\w+-]+ # Local part before the @ symbol @@ -32,21 +39,23 @@ class EmailAddressDetector(RegexBasedDetector): (?:\.[a-zA-Z]{2,4}) # TLD part """ # Pattern Breakdown: - # 1. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - + # 1. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - # Represents the local part of the email address before the @ symbol. # 2. (?:\.[\w+-]+)*: Matches zero or more of a-z, A-Z, _, +, -, but must start with a . 
(dot) # Allows for dot-separated words in the local part of the email address. # 3. @: Matches the @ symbol. - # 4. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - + # 4. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - # Represents the domain part of the email address after the @ symbol. # 5. (?:\.[\w+-]+)*: Matches zero or more of a-z, A-Z, _, +, -, but must start with a . (dot) # Allows for dot-separated words in the domain part of the email address. # 6. (?:\.[a-zA-Z]{2,4}): Matches 2 to 4 instances of a-z, A-Z, starting with a . (dot) # Represents the TLD (top-level domain) part of the email address. - deny_pattern = r"(?!" + "|".join(re.escape(email) for email in whitelist) + r"$)" + base_pattern + deny_pattern = r'(?!' \ + + '|'.join(re.escape(email) for email in whitelist) \ + + r'$)' + base_pattern # Combines the base pattern with a negative lookahead to exclude whitelist email addresses. denylist = [ - re.compile(r"\b" + deny_pattern + r"\b", flags=re.VERBOSE) + re.compile(r'\b' + deny_pattern + r'\b', flags=re.VERBOSE), ] diff --git a/tests/plugins/email_address_test.py b/tests/plugins/email_address_test.py index cb3fccb66..d33b3b4f6 100644 --- a/tests/plugins/email_address_test.py +++ b/tests/plugins/email_address_test.py @@ -1,4 +1,5 @@ import pytest + from detect_secrets.plugins.email_address import EmailAddressDetector From 7d4bc6763c0fff5813389b9438bb19b17fb9b51b Mon Sep 17 00:00:00 2001 From: Jingchao Zhong <92573736+perryzjc@users.noreply.github.com> Date: Thu, 16 Nov 2023 21:25:55 -0800 Subject: [PATCH 5/5] Add more test cases --- tests/plugins/email_address_test.py | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/plugins/email_address_test.py b/tests/plugins/email_address_test.py index d33b3b4f6..89e452ae2 100644 --- a/tests/plugins/email_address_test.py +++ b/tests/plugins/email_address_test.py @@ -52,6 +52,57 @@ class TestEmailAddressDetector: ('user@gmail.com', True), ('user@yahoo.com', True), 
('user@hotmail.com', True), + + # Additional test cases + + # Valid email addresses with different domain extensions + ('user@domain.co.uk', True), + ('user@domain.io', True), + ('user@domain.org', True), + ('user@sub.domain.com', True), + + # Valid email addresses with numbers + ('user123@example.com', True), + ('123user@example.com', True), + ('user123@123example.com', True), + + # Valid email addresses, part of larger text with special characters + ('Contact us at: user@example.com!', True), + ('Email: user@example.com for more info.', True), + + # Invalid email addresses with missing components + ('user@example', False), + ('user@.example.com', False), + ('@example.com', False), + ('user@', False), + + # Invalid email addresses with special characters + ('user@exa*mple.com', False), + ('user@examp!e.com', False), + ('user@exampl$.com', False), + ('user@exam^ple.com', False), + + # Unusual formats, mark as false + ('"user"@example.com', False), # Quoted local part + ('user@[123.123.123.123]', False), # IP address domain + + # Invalid email addresses, incorrect use of special characters + ('user@exa,mple.com', False), + ('user@exampcom', False), + ('user@exampl;e.com', False), + + # Edge cases - rare but valid email formats + ('user+mailbox/department=shipping@example.com', True), + ('customer/department=shipping@example.com', True), + ('!def!xyz%abc@example.com', True), + ('_Yosemite.Sam@example.com', True), + + # Edge cases - position of . (dot) + ('user@example..com', False), # Double dot in domain + ('.user@example.com', True), # Leading dot in local part + ('user@.example.com', False), # Leading dot in domain + ('user@example.com.', True), # Trailing dot in domain ], ) def test_analyze_line(self, payload, should_flag):