diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f2f363a0..137e13dac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,10 @@ If you love `detect-secrets`, please star our project on GitHub to show your sup
 ### v1.4.0
diff --git a/README.md b/README.md
index 03e7e36fb..a37fb4f7d 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ AzureStorageKeyDetector
 BasicAuthDetector
 CloudantDetector
 DiscordBotTokenDetector
+EmailAddressDetector
 GitHubTokenDetector
 Base64HighEntropyString
 HexHighEntropyString
diff --git a/detect_secrets/plugins/email_address.py b/detect_secrets/plugins/email_address.py
new file mode 100644
index 000000000..e752546a3
--- /dev/null
+++ b/detect_secrets/plugins/email_address.py
@@ -0,0 +1,68 @@
+import re
+
+from .base import RegexBasedDetector
+
+
+class EmailAddressDetector(RegexBasedDetector):
+    """
+    A detector for identifying email addresses within text. It uses regular expressions to
+    focus on general email structures, not strictly adhering to standards like RFC 5322.
+    Designed for efficient and broad detection, it also has some limitations.
+
+    Features:
+    - Detects a wide range of email formats efficiently.
+    - Ignores common, non-critical emails to minimize false positives.
+
+    Limitations:
+    - May miss edge cases or unconventional email formats.
+    - Not compliant with advanced formats, e.g., RFC 6530 non-Latin emails.
+    - Only matches TLDs of 2 to 4 ASCII letters; addresses ending in a longer TLD are missed.
+
+    Regular Expression:
+    Utilizes a regex pattern focusing on typical email components: local part, domain, TLD.
+    Excludes predefined whitelist emails to reduce false positives.
+
+    References:
+    - https://en.wikipedia.org/wiki/Email_address
+    - https://stackoverflow.com/a/14321045
+    """
+    secret_type = 'Email Address'
+
+    # Excludes whitelist email addresses from detection to reduce false positives.
+    whitelist = ['noreply@github.com', 'git@github.com']
+
+    base_pattern = r"""
+        [\w+-]+                # Local part before the @ symbol
+        (?:\.[\w+-]+)*         # Optional dot-separated words in the local part
+        @                      # The @ symbol
+        [\w+-]+                # Domain part after the @ symbol
+        (?:\.[\w+-]+)*         # Optional dot-separated words in the domain part
+        (?:\.[a-zA-Z]{2,4})    # TLD part
+    """
+    # Pattern Breakdown:
+    # 1. [\w+-]+: Matches one or more word characters (letters, digits, _), + or -
+    #    Represents the local part of the email address before the @ symbol.
+    # 2. (?:\.[\w+-]+)*: Matches zero or more groups of a . (dot) plus those characters
+    #    Allows for dot-separated words in the local part of the email address.
+    # 3. @: Matches the @ symbol.
+    # 4. [\w+-]+: Matches one or more word characters (letters, digits, _), + or -
+    #    Represents the domain part of the email address after the @ symbol.
+    # 5. (?:\.[\w+-]+)*: Matches zero or more groups of a . (dot) plus those characters
+    #    Allows for dot-separated words in the domain part of the email address.
+    # 6. (?:\.[a-zA-Z]{2,4}): Matches 2 to 4 instances of a-z, A-Z, starting with a . (dot)
+    #    Represents the TLD (top-level domain) part of the email address.
+
+    # Prefix the base pattern with a negative lookahead that rejects whitelisted addresses.
+    # The alternation is grouped so the end-of-address guard applies to every entry, and
+    # the guard (rather than `$`) lets a whitelisted address be skipped anywhere in a line
+    # while longer addresses that merely start with a whitelisted one are still flagged.
+    deny_pattern = (
+        r'(?!(?:'
+        + '|'.join(re.escape(email) for email in whitelist)
+        + r')(?!\.?[\w+-]))'
+        + base_pattern
+    )
+
+    denylist = [
+        re.compile(r'\b' + deny_pattern + r'\b', flags=re.VERBOSE),
+    ]
diff --git a/tests/plugins/email_address_test.py b/tests/plugins/email_address_test.py
new file mode 100644
index 000000000..89e452ae2
--- /dev/null
+++ b/tests/plugins/email_address_test.py
@@ -0,0 +1,117 @@
+import pytest
+
+from detect_secrets.plugins.email_address import EmailAddressDetector
+
+
+class TestEmailAddressDetector:
+    """
+    Testing strategy
+
+    Cover the cartesian product of these partitions:
+
+    1. Partition on email address format:
+       a. Valid email addresses
+       b. Invalid email addresses
+
+    2. Partition on line content:
+       a. email address is the only content
+       b. email address is part of a larger string
+
+    And cover these cases:
+
+    1. Partition on whitelist email addresses:
+       a. email address is in the whitelist
+       b. email address is not in the whitelist
+    """
+
+    @pytest.mark.parametrize(
+        'payload, should_flag',
+        [
+            # Valid email addresses, only content
+            ('user@example.com', True),
+            ('user.name@example.com', True),
+            ('user_name@example.com', True),
+            ('user-name@example.com', True),
+            ('user+name@example.com', True),
+            ('user@ex_ample.com', True),
+            ('user@-example.com', True),
+            ('user@example-.com', True),
+            ('user.name+category@example.com', True),
+            # Valid email addresses, part of larger string
+            ('This is an email address: user@example.com', True),
+            ('user@example.com is a valid email address', True),
+            # Invalid email addresses
+            ('user@com', False),
+            ('@example.com', False),
+            ('user@.com', False),
+            ('user@ex..com', False),
+            # Whitelist email addresses
+            ('noreply@github.com', False),
+            ('git@github.com', False),
+            # Whitelist email addresses, part of larger string
+            ('Mail noreply@github.com for help', False),
+            ('Clone via git@github.com today', False),
+            # Longer addresses that merely start with a whitelist entry
+            ('noreply@github.com.au', True),
+            ('git@github.com.au', True),
+            # Non-whitelist email addresses
+            ('user@gmail.com', True),
+            ('user@yahoo.com', True),
+            ('user@hotmail.com', True),
+
+            # Additional test cases
+
+            # Valid email addresses with different domain extensions
+            ('user@domain.co.uk', True),
+            ('user@domain.io', True),
+            ('user@domain.org', True),
+            ('user@sub.domain.com', True),
+
+            # Valid email addresses with numbers
+            ('user123@example.com', True),
+            ('123user@example.com', True),
+            ('user123@123example.com', True),
+
+            # Valid email addresses, part of larger text with special characters
+            ('Contact us at: user@example.com!', True),
+            ('Email: user@example.com for more info.', True),
+
+            # Invalid email addresses with missing components
+            ('user@example', False),
+            ('user@.example.com', False),
+            ('@example.com', False),
+            ('user@', False),
+
+            # Invalid email addresses with special characters
+            ('user@exa*mple.com', False),
+            ('user@examp!e.com', False),
+            ('user@exampl$.com', False),
+            ('user@exam^ple.com', False),
+
+            # Unusual formats, mark as false
+            ('"user"@example.com', False),  # Quoted local part
+            ('user@[123.123.123.123]', False),  # IP address domain
+
+            # Invalid email addresses, incorrect use of special characters
+            ('user@exa,mple.com', False),
+            ('user@exampcom', False),
+            ('user@exampl;e.com', False),
+
+            # Edge cases - rare but valid email formats
+            ('user+mailbox/department=shipping@example.com', True),
+            ('customer/department=shipping@example.com', True),
+            ('!def!xyz%abc@example.com', True),
+            ('_Yosemite.Sam@example.com', True),
+
+            # Edge cases - position of . (dot)
+            ('user@example..com', False),  # Double dot in domain
+            ('.user@example.com', True),  # Leading dot in local part
+            ('user@.example.com', False),  # Leading dot in domain
+            ('user@example.com.', True),  # Trailing dot in domain
+        ],
+    )
+    def test_analyze_line(self, payload, should_flag):
+        logic = EmailAddressDetector()
+
+        output = logic.analyze_line(filename='mock_filename', line=payload)
+        assert len(output) == int(should_flag)