-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_controls.py
172 lines (148 loc) · 6.34 KB
/
extract_controls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#import openai
import pdfplumber
import csv
import re
def read_pdf(file, pages):
    """
    Extract the text of the specified pages of a pdf file and return it as a string.

    Args:
        file: Path of the pdf file, passed to pdfplumber.open().
        pages: Page indexes to read. They are used directly as indexes into
            pdfplumber's ``pdf.pages`` list, so the first page is 0.
            Pass an empty list to read the entire pdf.

    Returns:
        The text of every requested page that has text, each page's text
        prefixed by a single space. Out-of-range indexes and pages without
        text are reported on stdout and skipped.

    Example 1:
        file = "test.pdf"
        pages = [1,2,3,8]
        read_pdf(file, pages)
    Example 2:
        file = "test.pdf"
        pages = []
        read_pdf(file, pages)
    """
    chunks = []
    # Open the file that was passed in (not a module-level global)
    with pdfplumber.open(file) as pdf:
        # Check the number of pages in the PDF file
        num_pages = len(pdf.pages)
        print(f"Number of pages in PDF file: {num_pages}")
        # An empty selection means "every page", starting at index 0
        selected_pages = pages if pages else range(num_pages)
        for page_num in selected_pages:
            # Reject indexes past the end, and negative ones, which would
            # otherwise silently wrap around to the end of the document
            if not 0 <= page_num < num_pages:
                print(f"Invalid page number: {page_num}")
                continue
            # Extract the text from the page
            page_text = pdf.pages[page_num].extract_text()
            if page_text:
                chunks.append(" " + page_text)
            else:
                print(f"No text found on page {page_num}")
    return "".join(chunks)
def remove_dots(input_string):
    """
    Collapse each run of two or more consecutive dots into a single '!'.

    Single dots (such as the ones inside a rule number) are left alone.
    """
    dot_run = re.compile(r'\.{2,}')
    return dot_run.sub('!', input_string)
def remove_sections(input_string):
    """
    Keep only the text between the "Recommendations" and "Appendix: " markers.

    The slice starts at the first "Recommendations" and stops just before the
    first "Appendix: " that follows it; the result is stripped of surrounding
    whitespace. If either marker cannot be found, a message is printed and the
    input is returned unchanged.
    """
    start = "Recommendations"
    end = "Appendix: "
    start_index = input_string.find(start)
    if start_index == -1:
        end_index = -1
    else:
        # Only look for the end marker after the start marker: an earlier
        # "Appendix: " would otherwise produce an empty slice
        end_index = input_string.find(end, start_index + len(start))
    if start_index == -1 or end_index == -1:
        print("Couldn't find start and/or end markers")
        return input_string
    return input_string[start_index:end_index].strip()
def remove_mid_newlines(input_string):
    """
    Replace every newline that is not at the end of a rule entry with a space.

    Assumes the dot leaders were already replaced by '!', so an entry ends with
    an exclamation mark, a space and the page number ("! 23"). Newlines right
    after that marker are kept; all others (e.g. a rule name wrapped over two
    lines, like 6.2.6) are replaced so each entry sits on a single line.
    """
    # A lookbehind cannot express "! <digits>" because its width varies, and a
    # character class such as [! \d+] only tests a single character. Instead,
    # the first alternative consumes the end-of-entry marker together with its
    # newline and is put back unchanged; any other newline falls through to the
    # second alternative and becomes a space.
    return re.sub(
        r'(! \d+\n)|\n',
        lambda found: found.group(1) or ' ',
        input_string,
    )
def extract_rules(input_string):
    """
    Build the table of rule numbers and rule names from the cleaned-up text.

    The text is split on newlines. A line is taken as a rule when it starts
    with a dotted number (e.g. 1.1 or 6.2.6) and contains the "! <page>"
    marker left by the earlier clean-up; other lines are ignored.

    Returns:
        A list of rows: the header ['RULE_NUMBER', 'RULE_NAME'] followed by
        one [number, name] row per rule, the name stripped of surrounding
        whitespace.
    """
    # group 1: dotted rule number, at least two components
    # \.?    : tolerate a trailing dot after the number without capturing it
    # group 2: everything up to the last "! <digits>" marker on the line
    # Compiled once here rather than re-parsed for every line.
    pattern = re.compile(r'(\d+(?:\.\d+)+)\.?(.*)! \d+')
    # initialize output list with header
    output_list = [['RULE_NUMBER', 'RULE_NAME']]
    for line in input_string.split('\n'):
        match = pattern.match(line)
        if match:
            # strip the spaces that separate the name from number and marker
            output_list.append([match.group(1), match.group(2).strip()])
    return output_list
def generate_csv(input_list, output_path='output.csv'):
    """
    Write the given rows to a csv file.

    Args:
        input_list: Iterable of rows, each row an iterable of cell values.
        output_path: File to create or overwrite. Defaults to 'output.csv'
            in the current working directory.
    """
    # newline='' lets the csv module control line endings itself; utf-8 keeps
    # non-ASCII characters in rule names writable whatever the locale is
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(input_list)
# Path of the pdf file to process
paperFilePath = "CIS_benchmarks/CIS_Google_Cloud_Platform_Foundation_Benchmark_v2.0.0.pdf"
#paperFilePath = "CIS_benchmarks/CIS_Amazon_Web_Services_Foundations_Benchmark_v1.5.0.pdf"
# Despite having 300+ pages, let's just focus on the index pages
index_pages = [2, 3, 4, 5, 6]
text = read_pdf(paperFilePath, index_pages)
# If we eventually were to use ChatGPT, this helps keep track of the number of tokens
print("Number of characters before cleanup:", len(text))
# Clean-up order matters: the later steps rely on the '!' that remove_dots inserts
text = remove_dots(text)
text = remove_sections(text)
text = remove_mid_newlines(text)
# If we eventually were to use ChatGPT, this helps keep track of the number of tokens
print("Number of characters after cleanup:", len(text))
# Not named `list`: that would shadow the builtin for the rest of the module
rules = extract_rules(text)
generate_csv(rules)
#def GPT_Completion(texts):
## Call the API key under your account (in a secure way)
# openai.api_key = "API KEY"
# response = openai.Completion.create(
# engine="text-davinci-002",
# prompt = texts,
# temperature = 0.6,
# top_p = 1,
# max_tokens = 3000,
# frequency_penalty = 0,
# presence_penalty = 0
# )
# return(response.choices[0].text)
#def create_CSV(input_str):
# Split the input string into separate lines
# lines = input_str.splitlines()
# Create a new CSV file and open it in write mode
# with open('output.csv', 'w', newline='') as csvfile:
# Create a csv writer object
# csvwriter = csv.writer(csvfile, delimiter=',')
# Write the headers to the CSV file
# csvwriter.writerow(['RULE_NUMBER', 'RULE_NAME'])
# Write the data to the CSV file
# for line in lines[1:]:
# fields = line.split(',', maxsplit=1)
# csvwriter.writerow(fields)
# print("CSV file created successfully.")
#query = 'With the following text, give me a list of NUMBER and TITLE. In the first example, NUMBER is 1.1 and TITLE is Ensure that Corporate Login Credentials are Used (Manual). In the second, NUMBER is 1.2 and TITLE is Ensure that Multi-Factor Authentication is Enabled for All Non-Service Accounts (Manual) ' + text
#n=0
#for line in lines:
# query = "Generate a cvs formatted output following this pattern: \
# RULE_NUMBER,RULE_NAME \
# 1.1,Ensure that Corporate Login Credentials are Used (Manual) \
# 1.2,Ensure that Multi-Factor Authentication is 'Enabled' for All Non-Service Accounts (Manual) \
# 1.3,Ensure that Security Key Enforcement is Enabled for All Admin Accounts (Manual) \
# ,only capturing the values that are preceded by a number or a number period another number as RULE_NUMBER, and the text next to it as RULE_NAME, based on this input: " + line
#result = GPT_Completion(query)
#create_CSV(result)