Merge pull request #203 from sbdchd/steve/type-toplevel
add mypy typing
BurnzZ authored Dec 19, 2022
2 parents d869ef5 + 945a7e6 commit 60bc7eb
Showing 27 changed files with 166 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9']
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']

     steps:
     - uses: actions/checkout@v2
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -9,3 +9,8 @@ repos:
     language_version: python3
   repo: https://github.com/PyCQA/isort
   rev: 5.10.1
+- hooks:
+  - id: mypy
+    additional_dependencies: [types-requests, types-mock, types-six, lxml-stubs]
+  repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v0.982
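With this hook in place, mypy (pinned at v0.982, with the listed stub packages) runs alongside the existing formatters. A minimal sketch of exercising it locally, assuming pre-commit is installed:

    pre-commit run mypy --all-files    # run just the new mypy hook
    pre-commit run --all-files         # run every configured hook, as CI would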
45 changes: 24 additions & 21 deletions extruct/_extruct.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 import logging
 import warnings
+from typing import Any, Callable

 from extruct.dublincore import DublinCoreExtractor
 from extruct.jsonld import JsonLdExtractor
@@ -15,17 +18,17 @@


 def extract(
-    htmlstring,
-    base_url=None,
-    encoding="UTF-8",
-    syntaxes=SYNTAXES,
-    errors="strict",
-    uniform=False,
-    return_html_node=False,
-    schema_context="http://schema.org",
-    with_og_array=False,
-    **kwargs
-):
+    htmlstring: str | bytes,
+    base_url: str | None = None,
+    encoding: str = "UTF-8",
+    syntaxes: list[str] = SYNTAXES,
+    errors: str = "strict",
+    uniform: bool = False,
+    return_html_node: bool = False,
+    schema_context: str = "http://schema.org",
+    with_og_array: bool = False,
+    url: str | None = None,  # deprecated
+) -> dict[str, list[dict[str, Any]]]:
     """
     htmlstring: string with valid html document;
     base_url: base url of the html document
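For reference, a short usage sketch of the newly annotated entry point; the HTML snippet and URL are illustrative, not from this diff:

    import extruct

    html = '<script type="application/ld+json">{"@type": "Person", "name": "Jane"}</script>'
    data = extruct.extract(html, base_url="https://example.com", syntaxes=["json-ld"])
    # data maps syntax name to extracted items, per the new return annotation:
    # {"json-ld": [{"@type": "Person", "name": "Jane"}]}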
@@ -44,15 +47,13 @@ def extract(
     The feature is supported only by microdata syntax.
     Each node is of `lxml.etree.Element` type.
     schema_context: schema's context for current page"""
-    if base_url is None and "url" in kwargs:
+    if base_url is None and url is not None:
         warnings.warn(
             '"url" argument is deprecated, please use "base_url"',
             DeprecationWarning,
             stacklevel=2,
         )
-        base_url = kwargs.pop("url")
-    if kwargs:
-        raise TypeError("Unexpected keyword arguments")
+        base_url = url
     if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
         raise ValueError(
             "syntaxes must be a list with any or all (default) of"
@@ -112,7 +113,7 @@ def extract(
                 tree,
             )
         )
-    output = {}
+    output: dict[str, list[dict[str, Any]]] = {}
     for syntax, extract, document in processors:
         try:
             output[syntax] = list(extract(document, base_url=base_url))
@@ -124,7 +125,9 @@ def extract(
if errors == "strict":
raise
if uniform:
uniform_processors = []
uniform_processors: list[
tuple[str, Callable[..., Any], list[Any], str | None]
] = []
if "microdata" in syntaxes:
uniform_processors.append(
(
@@ -162,14 +165,14 @@
                 )
             )

-    for syntax, uniform, raw, schema_context in uniform_processors:
+    for syntax, uniform_fn, raw, schema_ctx in uniform_processors:
         try:
             if syntax == "opengraph":
-                output[syntax] = uniform(raw, with_og_array=with_og_array)
+                output[syntax] = uniform_fn(raw, with_og_array=with_og_array)
             elif syntax == "dublincore":
-                output[syntax] = uniform(raw)
+                output[syntax] = uniform_fn(raw)
             else:
-                output[syntax] = uniform(raw, schema_context)
+                output[syntax] = uniform_fn(raw, schema_ctx)
         except Exception as e:
             if errors == "ignore":
                 output[syntax] = []
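Renaming the loop variables (uniform_fn, schema_ctx) avoids shadowing the uniform parameter and the schema_context argument, which mypy would otherwise flag. A sketch of the uniform=True path this loop serves (markup illustrative):

    import extruct

    html = '<html><head><meta property="og:title" content="Example" /></head></html>'
    data = extruct.extract(html, syntaxes=["opengraph"], uniform=True)
    # with uniform=True, Open Graph results come back as flattened
    # schema.org-style dicts rather than raw (property, value) pairs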
3 changes: 2 additions & 1 deletion extruct/dublincore.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 import re

 from w3lib.html import strip_html5_whitespace
@@ -110,7 +111,7 @@ def get_lower_attrib(name):
     return re.sub(r".*\.", "", name).lower()


-class DublinCoreExtractor(object):
+class DublinCoreExtractor:
     """DublinCore extractor following extruct API."""

     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
5 changes: 3 additions & 2 deletions extruct/jsonld.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 """
 JSON-LD extractor
@@ -14,7 +15,7 @@
 HTML_OR_JS_COMMENTLINE = re.compile(r"^\s*(//.*|<!--.*-->)")


-class JsonLdExtractor(object):
+class JsonLdExtractor:
     _xp_jsonld = lxml.etree.XPath(
         'descendant-or-self::script[@type="application/ld+json"]'
     )
@@ -26,7 +27,7 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
     def extract_items(self, document, base_url=None):
         return [
             item
-            for items in map(self._extract_items, self._xp_jsonld(document))
+            for items in map(self._extract_items, self._xp_jsonld(document))  # type: ignore[arg-type]
             if items
             for item in items
             if item
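The extractor's behavior is unchanged; the ignore comment only silences mypy's view of the XPath return type. A usage sketch (markup illustrative):

    from extruct.jsonld import JsonLdExtractor

    html = '<script type="application/ld+json">{"@type": "Article"}</script>'
    items = JsonLdExtractor().extract(html)
    # items == [{"@type": "Article"}]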
3 changes: 2 additions & 1 deletion extruct/microformat.py
@@ -1,7 +1,8 @@
+# mypy: disallow_untyped_defs=False
 import mf2py


-class MicroformatExtractor(object):
+class MicroformatExtractor:
     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         return list(self.extract_items(htmlstring, base_url=base_url))

3 changes: 2 additions & 1 deletion extruct/opengraph.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 import re

 from extruct.utils import parse_html
@@ -15,7 +16,7 @@
 }


-class OpenGraphExtractor(object):
+class OpenGraphExtractor:
     """OpenGraph extractor following extruct API."""

     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
5 changes: 3 additions & 2 deletions extruct/rdfa.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 """
 RDFa extractor
@@ -16,7 +17,7 @@
 from pyRdfa import pyRdfa as PyRdfa
 from pyRdfa.initialcontext import initial_context
 from rdflib import Graph
-from rdflib import logger as rdflib_logger
+from rdflib import logger as rdflib_logger  # type: ignore[no-redef]

 from extruct.utils import parse_xmldom_html

@@ -37,7 +38,7 @@
 )


-class RDFaExtractor(object):
+class RDFaExtractor:
     def _replaceNS(self, prop, html_element, head_element):
         """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""

24 changes: 15 additions & 9 deletions extruct/tool.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 import argparse
 import json
+from typing import Any

 import requests

@@ -8,14 +11,17 @@


 def metadata_from_url(
-    url,
-    syntaxes=SYNTAXES,
-    uniform=False,
-    schema_context="http://schema.org",
-    errors="strict",
-):
+    url: str,
+    syntaxes: list[str] = SYNTAXES,
+    uniform: bool = False,
+    schema_context: str = "http://schema.org",
+    errors: str = "strict",
+) -> dict[str, Any]:
     resp = requests.get(url, timeout=30)
-    result = {"url": url, "status": "{} {}".format(resp.status_code, resp.reason)}
+    result: dict[str, Any] = {
+        "url": url,
+        "status": "{} {}".format(resp.status_code, resp.reason),
+    }
     try:
         resp.raise_for_status()
     except requests.exceptions.HTTPError:
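A usage sketch for the annotated helper; the URL is illustrative and the call performs a real HTTP GET:

    from extruct.tool import metadata_from_url

    result = metadata_from_url("https://example.com", syntaxes=["opengraph"])
    print(result["url"], result["status"])  # e.g. https://example.com 200 OK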
@@ -33,7 +39,7 @@ def metadata_from_url(
     return result


-def main(args=None):
+def main(args: Any | None = None) -> Any:
     parser = argparse.ArgumentParser(prog="extruct", description=__doc__)
     arg = parser.add_argument
     arg("url", help="The target URL")
@@ -51,7 +57,7 @@ def main(args=None):
         default=False,
         help="""If True uniform output format of all syntaxes to a list of dicts.
         Returned dicts structure:
-        {'@context': 'http://example.com', 
+        {'@context': 'http://example.com',
         '@type': 'example_type',
         /* All other the properties in keys here */
         }""",
4 changes: 3 additions & 1 deletion extruct/uniform.py
@@ -1,4 +1,6 @@
+# mypy: disallow_untyped_defs=False
 import copy
+from typing import Any

 from six.moves.urllib.parse import urljoin, urlparse

@@ -10,7 +12,7 @@ def _uopengraph(extracted, with_og_array=False):
     for obj in extracted:
         # In order of appearance in the page
         properties = list(obj["properties"])
-        flattened = {}
+        flattened: dict[Any, Any] = {}

         for k, v in properties:
             if k not in flattened.keys():
1 change: 1 addition & 0 deletions extruct/utils.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 import lxml.html

14 changes: 9 additions & 5 deletions extruct/w3cmicrodata.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 """
 HTML Microdata parser
@@ -9,8 +10,11 @@
"""

from __future__ import annotations

import collections
from functools import partial
from typing import Any, Set

try:
from urlparse import urljoin
@@ -43,7 +47,7 @@
 )


-class LxmlMicrodataExtractor(object):
+class LxmlMicrodataExtractor:
     # iterate in document order (used below for fast get_docid)
     _xp_item = lxml.etree.XPath("descendant-or-self::*[@itemscope]")
     _xp_prop = lxml.etree.XPath(
@@ -70,14 +74,14 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):

     def extract_items(self, document, base_url):
         itemids = self._build_itemids(document)
-        items_seen = set()
+        items_seen: Set[Any] = set()
         return [
             item
             for item in (
                 self._extract_item(
                     it, items_seen=items_seen, base_url=base_url, itemids=itemids
                 )
-                for it in self._xp_item(document)
+                for it in self._xp_item(document)  # type: ignore[union-attr]
             )
             if item
         ]
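A usage sketch for the microdata extractor (markup illustrative):

    from extruct.w3cmicrodata import LxmlMicrodataExtractor

    html = (
        '<div itemscope itemtype="http://schema.org/Person">'
        '<span itemprop="name">Jane</span></div>'
    )
    items = LxmlMicrodataExtractor().extract(html)
    # roughly: [{"type": "http://schema.org/Person", "properties": {"name": "Jane"}}]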
@@ -88,7 +92,7 @@ def get_docid(self, node, itemids):
     def _build_itemids(self, document):
         """Build itemids for a fast get_docid implementation. Use document order."""
         root = document.getroottree().getroot()
-        return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
+        return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}  # type: ignore[arg-type]

     def _extract_item(self, node, items_seen, base_url, itemids):
         itemid = self.get_docid(node, itemids)
@@ -160,7 +164,7 @@ def _extract_item(self, node, items_seen, base_url, itemids):
         return item

     def _extract_properties(self, node, items_seen, base_url, itemids):
-        for prop in self._xp_prop(node):
+        for prop in self._xp_prop(node):  # type: ignore[union-attr]
             for p, v in self._extract_property(
                 prop, items_seen=items_seen, base_url=base_url, itemids=itemids
             ):