-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerate_readme.py
301 lines (236 loc) · 13.2 KB
/
generate_readme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
import os
import json
from pathlib import Path
from typing import Dict, List, Optional, Set, Any, Tuple
# Constants

# Directory names skipped when scanning the repository root for language folders.
EXCLUDE_DIRS: Set[str] = {
    ".data",
    ".git",
    ".github",
    ".scripts",
    ".vscode",
    "__pycache__",
    "vulnerabilities-workflow",
}

# File names to exclude (not referenced by the code visible in this file).
EXCLUDE_FILES: Set[str] = {
    "LICENSE",
    "README.md",
    "generate_readme.py",
}

# Document file name -> link label used in the generated README tables.
# Insertion order is significant: links are emitted in this order.
DOC_TYPES: Dict[str, str] = {
    "sec-design.md": "Security Design Review",
    "sec-design-deep-analysis.md": "Security Design Review - Deep Analysis",
    "threat-modeling.md": "Threat Modeling",
    "attack-surface.md": "Attack Surface",
    "attack-tree.md": "Attack Tree",
    "mitigations.md": "Mitigation Strategies",
    "vulnerabilities-workflow-1.md": "Vulnerabilities Workflow",
}
# Markdown footer of the main README (generate_main_readme appends it last).
# Despite the name it holds three sections: known limitations, contribution /
# issue-reporting guidelines, and the sponsorship links.
SPONSORSHIP = """## ⚠️ Known Limitations
- Documentation accuracy varies by model and project
- Some formatting issues exist (diagrams, tables)
- Security recommendations need expert validation
- Model responses may contain inaccuracies
- Documentation was generated based on the model's capabilities at the time of the **cut off date**
## 🤝 How to Contribute
Help us improve by:
1. Reviewing documentation and reporting inaccuracies
2. Suggesting better LLM models to test
3. Recommending documentation improvements
4. Sharing which document types you find most useful
### Reporting Issues
Create issues with:
- Label `model-evaluation` for LLM model feedback
- Label `doc-type-feedback` for document type evaluation
- Label `content` for accuracy concerns
- Label `formatting` for layout problems
[Create New Issue](https://github.com/xvnpw/sec-docs/issues/new)
## 💝 Support the Project
This research requires access to various AI models and computing resources. Support our work through:
- [GitHub Sponsors](https://github.com/sponsors/xvnpw)
- Contributing feedback and improvements
Your support helps us evaluate more models and improve documentation quality for the OSS community.
"""
# Markdown header of the main README (generate_main_readme emits it first).
# It ends with the "Current Projects" heading, under which the generated
# per-language project lists are appended.
INTRODUCTION = """# sec-docs
An experimental project using LLM technology to generate security documentation for Open Source Software (OSS) projects.
## 🔍 Project Overview
We're exploring how different LLM models can help create comprehensive security documentation including:
- Attack surface analysis
- Attack trees
- Security design reviews
- Threat modeling
## 🧪 Experimental Status
This is an early-phase research project currently testing:
- Gemini 2.0 Flash Thinking Experimental - model cut off date: **end of August 2024** (updated 21.01.2025)
- Gemini 2.0 Pro Experimental - model cut off date: **end of August 2024**
- Other LLM models (planned)
### News
- 2025-02-19: Finished re-processing all projects using latest Gemini 2.0 Pro Experimental model
- 2025-02-04: Finished re-processing all projects using latest Gemini 2.0 Flash Thinking Experimental model, updated at 21.01.2025
- 2025-02-02: Added mitigations using Gemini 2.0 Flash Thinking Experimental - [blog](https://xvnpw.github.io/posts/forget-threats-mitigations-are-all-you-really-need/)
- 2025-01-22: Added analysis for temperature 0 using Gemini 2.0 Flash Thinking Experimental
- 2025-01-10: Deep analysis finished for all projects using Gemini 2.0 Flash Thinking Experimental - [blog](https://xvnpw.github.io/posts/ai-security-analyzer-deep-analysis-mode/)
- 2025-01-01: Processed 1000+ projects ([list](.data/origin_repos.txt)) using Gemini 2.0 Flash Thinking Experimental - [blog](https://xvnpw.github.io/posts/scaling-threat-modeling-with-ai/)
## Help Us Evaluate!
We need community help to determine:
1. Which LLM models produce the most accurate security documentation
2. Which types of security documents are most valuable
3. How to improve documentation quality and reliability
## How to Navigate This Repository
**sec-docs** is organized by programming language, with folders for each major OSS project. Each project contains subfolders with detailed analyses performed at a specific date using a certain LLM model.
### Current Projects
"""
class ProjectData:
    """Scan the repository tree and generate project, language and main README files.

    The tree under ``base_dir`` is walked as
    ``<language>/<owner>/<project>/<version>/``. Each version directory may
    hold an ``output-metadata.json`` plus any of the documents listed in
    ``DOC_TYPES``; each project directory may hold a ``config.json`` with a
    ``repo_url`` entry.
    """

    def __init__(self, base_dir: Path = Path(".")):
        self.base_dir = base_dir
        # language -> owner -> project -> {"project_lines": [...], "language_lines": [...]}
        self.languages: Dict[str, Dict[str, Dict[str, Dict[str, List[str]]]]] = {}

    @staticmethod
    def _read_json_dict(path: Path, description: str) -> Dict[str, Any]:
        """Load a JSON object from *path*; return ``{}`` if missing, unreadable or not an object."""
        if not path.is_file():
            return {}
        try:
            # Explicit UTF-8 so parsing does not depend on the platform locale.
            with open(path, "r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error reading {description} file {path}: {e}")
            return {}
        # Callers use .get(); a top-level list/scalar would crash them.
        return data if isinstance(data, dict) else {}

    @staticmethod
    def _extract_flag_value(analyzer_args: str, flag: str) -> Optional[str]:
        """Return the token following the first occurrence of *flag*, or ``None``.

        Both ``--flag value`` and ``--flag=value`` spellings are accepted.
        """
        _, found, remainder = analyzer_args.partition(flag)
        if not found:
            return None
        tokens = remainder.split()
        if not tokens:
            return None
        return tokens[0].lstrip("=") or None

    @staticmethod
    def _subdirs(directory: Path) -> List[Path]:
        """Return the sub-directories of *directory* sorted by name.

        ``Path.iterdir`` yields entries in arbitrary order; sorting keeps the
        generated READMEs stable between runs and platforms.
        """
        return sorted((entry for entry in directory.iterdir() if entry.is_dir()), key=lambda p: p.name)

    def get_github_link(self, project_dir: Path) -> str:
        """Return the GitHub URL from ``config.json`` in *project_dir*.

        A ``repo_url`` that does not already start with
        ``https://github.com/`` is treated as ``owner/repo`` and prefixed.
        Returns an empty string when no usable URL is configured.
        """
        config = self._read_json_dict(project_dir / "config.json", "config")
        repo_url = config.get("repo_url") or ""
        if not isinstance(repo_url, str):
            return ""
        if repo_url and not repo_url.startswith("https://github.com/"):
            return f"https://github.com/{repo_url}"
        return repo_url

    def get_metadata(self, version_dir: Path) -> Dict[str, Any]:
        """Return the contents of ``output-metadata.json`` in *version_dir*, or ``{}``."""
        return self._read_json_dict(version_dir / "output-metadata.json", "metadata")

    def process_version(
        self, language: str, owner: str, project: str, version: str, version_dir: Path
    ) -> Tuple[str, str]:
        """Build the Markdown table rows describing one version directory.

        Returns:
            ``(project_line, language_line)``. Both rows have the same columns
            (date, model, temperature, deep-analysis mark, document links);
            they differ only in the link targets, which are relative to the
            project directory and to the language directory respectively.
        """
        metadata = self.get_metadata(version_dir)
        # The analysis date is taken from the first 10 characters of the directory name.
        analysis_date = version[:10]
        model_name = metadata.get("agent_model", "Unknown Model")
        agent_temperature = metadata.get("agent_temperature", 0)

        # The metadata schema is not visible here: tolerate null and list values.
        raw_args = metadata.get("analyzer_args") or ""
        if isinstance(raw_args, (list, tuple)):
            raw_args = " ".join(str(arg) for arg in raw_args)
        analyzer_args = str(raw_args)
        deep_analysis = "✅" if "deep-analysis" in analyzer_args else ""

        # Combine primary and secondary agent settings when a secondary one was used.
        secondary_model = self._extract_flag_value(analyzer_args, "--secondary-agent-model")
        secondary_temp = self._extract_flag_value(analyzer_args, "--secondary-agent-temperature")
        if secondary_model:
            model_name = f"{model_name} / {secondary_model}"
        if secondary_temp:
            agent_temperature = f"{agent_temperature} / {secondary_temp}"

        project_doc_links = []  # relative to the project directory
        language_doc_links = []  # relative to the language directory
        for doc_file, doc_name in DOC_TYPES.items():
            if (version_dir / doc_file).exists():
                project_doc_links.append(f"[{doc_name}]({version}/{doc_file})")
                language_doc_links.append(f"[{doc_name}]({owner}/{project}/{version}/{doc_file})")

        row_prefix = f"| {analysis_date} | {model_name} | {agent_temperature} | {deep_analysis} |"
        project_line = f"{row_prefix} {', '.join(project_doc_links)} |"
        language_line = f"{row_prefix} {', '.join(language_doc_links)} |"
        return project_line, language_line

    def collect_data(self) -> None:
        """Walk the directory tree, fill ``self.languages`` and write every README.

        A project README is written after each project, a language README
        after each language, and the main README once at the end. Languages,
        owners and projects are visited in name order; versions newest-first
        (descending directory name).
        """
        for language_dir in self._subdirs(self.base_dir):
            if language_dir.name in EXCLUDE_DIRS:
                continue
            language = language_dir.name
            self.languages[language] = {}
            for owner_dir in self._subdirs(language_dir):
                owner = owner_dir.name
                self.languages[language][owner] = {}
                for project_dir in self._subdirs(owner_dir):
                    project = project_dir.name
                    entry: Dict[str, List[str]] = {"project_lines": [], "language_lines": []}
                    self.languages[language][owner][project] = entry
                    for version_dir in reversed(self._subdirs(project_dir)):
                        project_line, language_line = self.process_version(
                            language, owner, project, version_dir.name, version_dir
                        )
                        entry["project_lines"].append(project_line)
                        entry["language_lines"].append(language_line)
                    # Generate project README
                    self.generate_project_readme(language, owner, project)
            # Generate language README
            self.generate_language_readme(language)
        # Generate main README
        self.generate_main_readme()

    def generate_project_readme(self, language: str, owner: str, project: str) -> None:
        """Write ``README.md`` for one project from its collected ``project_lines``."""
        project_dir = self.base_dir / language / owner / project
        readme_lines = [f"# {project.title()} Analysis"]
        github_link = self.get_github_link(project_dir)
        if github_link:
            readme_lines.append(f"\n[GitHub Repository]({github_link})\n")
        readme_lines.append("| Analysis Date | Model | T | Deep Analysis | Documents |")
        readme_lines.append("|---------------|-------|---|:-------------:|-----------|")
        readme_lines.extend(self.languages[language][owner][project]["project_lines"])
        # Rows may contain non-ASCII characters (the deep-analysis mark), so force UTF-8.
        (project_dir / "README.md").write_text("\n".join(readme_lines), encoding="utf-8")

    def generate_language_readme(self, language: str) -> None:
        """Write ``README.md`` for a language directory listing every project version."""
        readme_lines = [
            f"# {language.title()} Projects",
            "| Project | Analysis Date | Model | T | Deep Analysis | Documentation |",
            "|---------|---------------|-------|---|:-------------:|---------------|",
        ]
        for owner, projects in self.languages[language].items():
            for project, data in projects.items():
                project_dir = self.base_dir / language / owner / project
                github_link = self.get_github_link(project_dir)
                project_name = f"[{owner}/{project}]({owner}/{project}/)"
                if github_link:
                    project_name += f" ([GitHub]({github_link}))"
                for version_line in data["language_lines"]:
                    cells = [cell.strip() for cell in version_line.split("|")]
                    # A full row splits into: '', date, model, T, deep, docs, ''.
                    # Indices 1..5 are read, so at least 6 cells are required.
                    if len(cells) < 6:
                        continue
                    date, model, temperature, deep_analysis, docs = cells[1:6]
                    readme_lines.append(
                        f"| {project_name} | {date} | {model} | {temperature} | {deep_analysis} | {docs} |"
                    )
        readme_path = self.base_dir / language / "README.md"
        readme_path.write_text("\n".join(readme_lines), encoding="utf-8")

    def generate_main_readme(self) -> None:
        """Write the repository-level ``README.md``: introduction, project index, footer."""
        readme_lines = [INTRODUCTION]
        for language, owners in self.languages.items():
            readme_lines.append(f"\n### [{language.title()}]({language}/)\n")
            for owner, projects in owners.items():
                for project in projects:
                    project_dir = self.base_dir / language / owner / project
                    github_link = self.get_github_link(project_dir)
                    entry = f"- [{owner}/{project}]({language}/{owner}/{project})"
                    # Only add the parenthetical when there is a link; avoids a dangling "()".
                    if github_link:
                        entry += f" ([GitHub]({github_link}))"
                    readme_lines.append(entry)
        readme_lines.append("\n" + SPONSORSHIP)
        # The introduction and footer contain emoji, so the file must be written as UTF-8.
        (self.base_dir / "README.md").write_text("\n".join(readme_lines), encoding="utf-8")
def main() -> None:
    """Regenerate every README, scanning from the current working directory."""
    # ProjectData() defaults its base directory to Path(".").
    ProjectData().collect_data()


if __name__ == "__main__":
    main()