generate_descriptions.py
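"""Scrape a list of documentation URLs and generate a one-sentence description
for each page with the OpenAI API, saving the results to a JSON file."""
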
import json
import requests
from bs4 import BeautifulSoup
from openai import OpenAI  # Import the new (v1+) OpenAI client
import logging
import time
from dotenv import load_dotenv
import os  # Import os to access environment variables

load_dotenv()

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize the OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def read_urls_from_file(file_path):
    """Read URLs from a text file, one URL per line."""
    try:
        with open(file_path, 'r') as file:
            urls = [line.strip() for line in file if line.strip()]
        logger.info(f"Successfully read {len(urls)} URLs from {file_path}")
        return urls
    except Exception as e:
        logger.error(f"Error reading URLs from {file_path}: {e}")
        return []
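
# Expected format of the input file: one URL per line, for example
# (hypothetical URLs, for illustration only):
#   https://example.com/docs/getting-started
#   https://example.com/docs/api-reference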


def fetch_page_content(url):
    """Helper function to fetch the content of a single page."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    cookies = {
        "example_cookie": "example_value"  # Replace with actual cookies if needed
    }
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes
        return response.text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        if hasattr(e, 'response') and e.response is not None:
            logger.error(f"Response content: {e.response.text}")  # Log the response content
        return ""


def extract_text_from_html(html_content):
    """Helper function to extract and clean text from HTML."""
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()
    # Get text and clean it
    text = soup.get_text(separator=" ", strip=True)
    return text


def scrape_page(url):
    """Scrape the content of a webpage."""
    try:
        logger.info(f"Scraping {url}...")
        html_content = fetch_page_content(url)
        if html_content:
            text = extract_text_from_html(html_content)
            logger.debug(f"Scraped content (first 500 chars): {text[:500]}")
            return text[:500]  # Return the first 500 characters as a summary
        else:
            logger.warning(f"No content fetched for {url}")
            return ""
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        return ""


def generate_description(text):
    """Generate a one-sentence description using OpenAI."""
    try:
        logger.info("Generating description using OpenAI...")
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",  # Use the appropriate model
            messages=[
                {"role": "system", "content": "Summarize the following content in one sentence:"},
                {"role": "user", "content": text}
            ],
            max_tokens=50
        )
        description = response.choices[0].message.content.strip()
        logger.debug(f"Generated description: {description}")
        return description
    except Exception as e:
        logger.error(f"Error generating description: {e}")
        return ""


def process_urls(urls):
    """Process each URL, scrape content, and generate descriptions."""
    results = []
    for url in urls:
        logger.info(f"Processing {url}...")
        text = scrape_page(url)
        if text:  # Only generate a description if scraping was successful
            description = generate_description(text)
            results.append({"url": url, "description": description})
        else:
            logger.warning(f"Skipping description generation for {url} due to scraping error.")
            results.append({"url": url, "description": ""})
        time.sleep(2)  # Add a delay between requests to avoid overwhelming the server
    return results


def save_to_json(data, output_file):
    """Save the results to a JSON file."""
    try:
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)
        logger.info(f"Descriptions saved to {output_file}")
    except Exception as e:
        logger.error(f"Error saving to {output_file}: {e}")
if __name__ == "__main__":
# Input and output file paths
input_file = "urls_docs.txt"
output_file = "docs_links_with_descriptions.json"
# Read URLs from the input file
urls = read_urls_from_file(input_file)
if not urls:
logger.error("No URLs found in the input file. Exiting.")
exit(1)
# Process URLs and generate descriptions
results = process_urls(urls)
# Save the results to a JSON file
save_to_json(results, output_file)