Skip to content

Commit

Permalink
scrape_docs: Fix h1 exclusion and refactor
Browse files Browse the repository at this point in the history
- The h1 tag shouldn't be excluded from the output
- Refactor for readability: Add a separate function that determines how
  many indentation levels are required.
  • Loading branch information
JOJ0 committed Oct 15, 2024
1 parent 612298c commit bc60b17
Showing 1 changed file with 22 additions and 9 deletions.
31 changes: 22 additions & 9 deletions scrape_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,24 @@ def scrape(output, url):
The default output format is "csv", which gives a two column CSV table
containing restructuredText formatted hyperlinks and a headline.
'''
def get_indentation_levels(heading_tags, heading_tag):
    """Return how many indentation levels are required for a heading tag.

    h1 is no indentation,
    h2 is one indentation level,
    h3 is two, and so on...

    heading_tags is the collection of tag names that count as headings
    (e.g. ['h1', 'h2', 'h3']); heading_tag is the tag name to look up.
    The result is the trailing digit of heading_tag minus one when
    heading_tag is one of heading_tags, and 0 for any other tag name.
    """
    # Anything that is not a known heading tag gets no indentation.
    if heading_tag not in heading_tags:
        return 0
    # The heading level is encoded in the last character of the tag name.
    return int(heading_tag[-1]) - 1

chapter = url
apidoc = requests.get(chapter).text
soup = BeautifulSoup(apidoc, 'html.parser')

any_heading_tag = ['h2', 'h3', 'h4', 'h5', 'h6']
any_heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
elements = soup.find_all([*any_heading_tag, 'a'],)

for e in elements:
Expand All @@ -43,18 +56,18 @@ def scrape(output, url):
link = e['href']
if output == 'debug':
print(f'Element text:\t{e.text}\nLink/Anchor:\t{link}')
indent_count = get_indentation_levels(any_heading_tag,
e.parent.name)
print(f'Indentations:\t{indent_count}')
if output in ['rst', 'csv']:
parts = chapter.split('/admin_api/')
fulllink = f'{parts[0]}/admin_api/{parts[1]}{link}'
indent_count = get_indentation_levels(any_heading_tag,
e.parent.name)
spacing = ''
for h in any_heading_tag:
if e.parent.name == h:
# h2 is no spacing (decrease by 2),
# h3 is 2 spaces, h4 is 4....
# two literal spaces are replaced by '|indent| '
spacing_count = int(e.parent.name[-1]) - 2
for val in range(0, spacing_count * 2):
spacing += '|indent| '
for val in range(0, indent_count):
# '|indent| ' represents one indentation level
spacing += '|indent| '
rst = f'{spacing}`{e.text} <{fulllink}>`_'
if output == 'csv':
left_col = f'"{rst}"'
Expand Down

0 comments on commit bc60b17

Please sign in to comment.