Merge pull request #203 from sbdchd/steve/type-toplevel
add mypy typing
BurnzZ authored Dec 19, 2022
2 parents d869ef5 + 945a7e6 commit 60bc7eb
Showing 27 changed files with 166 additions and 98 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9']
+        python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']

     steps:
     - uses: actions/checkout@v2
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -9,3 +9,8 @@ repos:
     language_version: python3
   repo: https://github.com/PyCQA/isort
   rev: 5.10.1
+- hooks:
+  - id: mypy
+    additional_dependencies: [types-requests, types-mock, types-six, lxml-stubs]
+  repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v0.982
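With this hook in place, mypy (pinned at v0.982, with the listed stub packages) runs alongside the existing formatters. A minimal sketch of exercising it locally, assuming pre-commit is installed:

    pre-commit run mypy --all-files    # run just the new mypy hook
    pre-commit run --all-files         # run every configured hook, as CI would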
45 changes: 24 additions & 21 deletions extruct/_extruct.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 import logging
 import warnings
+from typing import Any, Callable

 from extruct.dublincore import DublinCoreExtractor
 from extruct.jsonld import JsonLdExtractor
@@ -15,17 +18,17 @@


 def extract(
-    htmlstring,
-    base_url=None,
-    encoding="UTF-8",
-    syntaxes=SYNTAXES,
-    errors="strict",
-    uniform=False,
-    return_html_node=False,
-    schema_context="http://schema.org",
-    with_og_array=False,
-    **kwargs
-):
+    htmlstring: str | bytes,
+    base_url: str | None = None,
+    encoding: str = "UTF-8",
+    syntaxes: list[str] = SYNTAXES,
+    errors: str = "strict",
+    uniform: bool = False,
+    return_html_node: bool = False,
+    schema_context: str = "http://schema.org",
+    with_og_array: bool = False,
+    url: str | None = None,  # deprecated
+) -> dict[str, list[dict[str, Any]]]:
     """
     htmlstring: string with valid html document;
     base_url: base url of the html document
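For reference, a short usage sketch of the newly annotated entry point; the HTML snippet and URL are illustrative, not from this diff:

    import extruct

    html = '<script type="application/ld+json">{"@type": "Person", "name": "Jane"}</script>'
    data = extruct.extract(html, base_url="https://example.com", syntaxes=["json-ld"])
    # data maps syntax name to extracted items, per the new return annotation:
    # {"json-ld": [{"@type": "Person", "name": "Jane"}]}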
@@ -44,15 +47,13 @@ def extract(
     The feature is supported only by microdata syntax.
     Each node is of `lxml.etree.Element` type.
     schema_context: schema's context for current page"""
-    if base_url is None and "url" in kwargs:
+    if base_url is None and url is not None:
         warnings.warn(
             '"url" argument is deprecated, please use "base_url"',
             DeprecationWarning,
             stacklevel=2,
         )
-        base_url = kwargs.pop("url")
-    if kwargs:
-        raise TypeError("Unexpected keyword arguments")
+        base_url = url
     if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)):
         raise ValueError(
             "syntaxes must be a list with any or all (default) of"
@@ -112,7 +113,7 @@ def extract(
                 tree,
             )
         )
-    output = {}
+    output: dict[str, list[dict[str, Any]]] = {}
     for syntax, extract, document in processors:
         try:
             output[syntax] = list(extract(document, base_url=base_url))
@@ -124,7 +125,9 @@ def extract(
if errors == "strict":
raise
if uniform:
uniform_processors = []
uniform_processors: list[
tuple[str, Callable[..., Any], list[Any], str | None]
] = []
if "microdata" in syntaxes:
uniform_processors.append(
(
@@ -162,14 +165,14 @@
                 )
             )

-    for syntax, uniform, raw, schema_context in uniform_processors:
+    for syntax, uniform_fn, raw, schema_ctx in uniform_processors:
         try:
             if syntax == "opengraph":
-                output[syntax] = uniform(raw, with_og_array=with_og_array)
+                output[syntax] = uniform_fn(raw, with_og_array=with_og_array)
             elif syntax == "dublincore":
-                output[syntax] = uniform(raw)
+                output[syntax] = uniform_fn(raw)
             else:
-                output[syntax] = uniform(raw, schema_context)
+                output[syntax] = uniform_fn(raw, schema_ctx)
         except Exception as e:
             if errors == "ignore":
                 output[syntax] = []
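Renaming the loop variables (uniform_fn, schema_ctx) avoids shadowing the uniform parameter and the schema_context argument, which mypy would otherwise flag. A sketch of the uniform=True path this loop serves (markup illustrative):

    import extruct

    html = '<html><head><meta property="og:title" content="Example" /></head></html>'
    data = extruct.extract(html, syntaxes=["opengraph"], uniform=True)
    # with uniform=True, Open Graph results come back as flattened
    # schema.org-style dicts rather than raw (property, value) pairs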
3 changes: 2 additions & 1 deletion extruct/dublincore.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 import re

 from w3lib.html import strip_html5_whitespace
@@ -110,7 +111,7 @@ def get_lower_attrib(name):
     return re.sub(r".*\.", "", name).lower()


-class DublinCoreExtractor(object):
+class DublinCoreExtractor:
     """DublinCore extractor following extruct API."""

     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
5 changes: 3 additions & 2 deletions extruct/jsonld.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 """
 JSON-LD extractor
@@ -14,7 +15,7 @@
 HTML_OR_JS_COMMENTLINE = re.compile(r"^\s*(//.*|<!--.*-->)")


-class JsonLdExtractor(object):
+class JsonLdExtractor:
     _xp_jsonld = lxml.etree.XPath(
         'descendant-or-self::script[@type="application/ld+json"]'
     )
@@ -26,7 +27,7 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
     def extract_items(self, document, base_url=None):
         return [
             item
-            for items in map(self._extract_items, self._xp_jsonld(document))
+            for items in map(self._extract_items, self._xp_jsonld(document))  # type: ignore[arg-type]
             if items
             for item in items
             if item
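The extractor's behavior is unchanged; the ignore comment only silences mypy's view of the XPath return type. A usage sketch (markup illustrative):

    from extruct.jsonld import JsonLdExtractor

    html = '<script type="application/ld+json">{"@type": "Article"}</script>'
    items = JsonLdExtractor().extract(html)
    # items == [{"@type": "Article"}]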
3 changes: 2 additions & 1 deletion extruct/microformat.py
@@ -1,7 +1,8 @@
+# mypy: disallow_untyped_defs=False
 import mf2py


-class MicroformatExtractor(object):
+class MicroformatExtractor:
     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
         return list(self.extract_items(htmlstring, base_url=base_url))

3 changes: 2 additions & 1 deletion extruct/opengraph.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 import re

 from extruct.utils import parse_html
@@ -15,7 +16,7 @@
 }


-class OpenGraphExtractor(object):
+class OpenGraphExtractor:
     """OpenGraph extractor following extruct API."""

     def extract(self, htmlstring, base_url=None, encoding="UTF-8"):
5 changes: 3 additions & 2 deletions extruct/rdfa.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 """
 RDFa extractor
@@ -16,7 +17,7 @@
 from pyRdfa import pyRdfa as PyRdfa
 from pyRdfa.initialcontext import initial_context
 from rdflib import Graph
-from rdflib import logger as rdflib_logger
+from rdflib import logger as rdflib_logger  # type: ignore[no-redef]

 from extruct.utils import parse_xmldom_html

@@ -37,7 +38,7 @@
 )


-class RDFaExtractor(object):
+class RDFaExtractor:
     def _replaceNS(self, prop, html_element, head_element):
         """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')"""

24 changes: 15 additions & 9 deletions extruct/tool.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 import argparse
 import json
+from typing import Any

 import requests

@@ -8,14 +11,17 @@


 def metadata_from_url(
-    url,
-    syntaxes=SYNTAXES,
-    uniform=False,
-    schema_context="http://schema.org",
-    errors="strict",
-):
+    url: str,
+    syntaxes: list[str] = SYNTAXES,
+    uniform: bool = False,
+    schema_context: str = "http://schema.org",
+    errors: str = "strict",
+) -> dict[str, Any]:
     resp = requests.get(url, timeout=30)
-    result = {"url": url, "status": "{} {}".format(resp.status_code, resp.reason)}
+    result: dict[str, Any] = {
+        "url": url,
+        "status": "{} {}".format(resp.status_code, resp.reason),
+    }
     try:
         resp.raise_for_status()
     except requests.exceptions.HTTPError:
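A usage sketch for the annotated helper; the URL is illustrative and the call performs a real HTTP GET:

    from extruct.tool import metadata_from_url

    result = metadata_from_url("https://example.com", syntaxes=["opengraph"])
    print(result["url"], result["status"])  # e.g. https://example.com 200 OK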
@@ -33,7 +39,7 @@ def metadata_from_url(
     return result


-def main(args=None):
+def main(args: Any | None = None) -> Any:
     parser = argparse.ArgumentParser(prog="extruct", description=__doc__)
     arg = parser.add_argument
     arg("url", help="The target URL")
@@ -51,7 +57,7 @@ def main(args=None):
         default=False,
         help="""If True uniform output format of all syntaxes to a list of dicts.
         Returned dicts structure:
-        {'@context': 'http://example.com', 
+        {'@context': 'http://example.com',
         '@type': 'example_type',
         /* All other the properties in keys here */
         }""",
4 changes: 3 additions & 1 deletion extruct/uniform.py
@@ -1,4 +1,6 @@
+# mypy: disallow_untyped_defs=False
 import copy
+from typing import Any

 from six.moves.urllib.parse import urljoin, urlparse

@@ -10,7 +12,7 @@ def _uopengraph(extracted, with_og_array=False):
     for obj in extracted:
         # In order of appearance in the page
         properties = list(obj["properties"])
-        flattened = {}
+        flattened: dict[Any, Any] = {}

         for k, v in properties:
             if k not in flattened.keys():
1 change: 1 addition & 0 deletions extruct/utils.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 # -*- coding: utf-8 -*-
 import lxml.html

14 changes: 9 additions & 5 deletions extruct/w3cmicrodata.py
@@ -1,3 +1,4 @@
+# mypy: disallow_untyped_defs=False
 """
 HTML Microdata parser
@@ -9,8 +10,11 @@
"""

from __future__ import annotations

import collections
from functools import partial
from typing import Any, Set

try:
from urlparse import urljoin
@@ -43,7 +47,7 @@
 )


-class LxmlMicrodataExtractor(object):
+class LxmlMicrodataExtractor:
     # iterate in document order (used below for fast get_docid)
     _xp_item = lxml.etree.XPath("descendant-or-self::*[@itemscope]")
     _xp_prop = lxml.etree.XPath(
@@ -70,14 +74,14 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"):

     def extract_items(self, document, base_url):
         itemids = self._build_itemids(document)
-        items_seen = set()
+        items_seen: Set[Any] = set()
         return [
             item
             for item in (
                 self._extract_item(
                     it, items_seen=items_seen, base_url=base_url, itemids=itemids
                 )
-                for it in self._xp_item(document)
+                for it in self._xp_item(document)  # type: ignore[union-attr]
             )
             if item
         ]
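A usage sketch for the microdata extractor (markup illustrative):

    from extruct.w3cmicrodata import LxmlMicrodataExtractor

    html = (
        '<div itemscope itemtype="http://schema.org/Person">'
        '<span itemprop="name">Jane</span></div>'
    )
    items = LxmlMicrodataExtractor().extract(html)
    # roughly: [{"type": "http://schema.org/Person", "properties": {"name": "Jane"}}]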
@@ -88,7 +92,7 @@ def get_docid(self, node, itemids):
     def _build_itemids(self, document):
         """Build itemids for a fast get_docid implementation. Use document order."""
         root = document.getroottree().getroot()
-        return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}
+        return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))}  # type: ignore[arg-type]

     def _extract_item(self, node, items_seen, base_url, itemids):
         itemid = self.get_docid(node, itemids)
@@ -160,7 +164,7 @@ def _extract_item(self, node, items_seen, base_url, itemids):
         return item

     def _extract_properties(self, node, items_seen, base_url, itemids):
-        for prop in self._xp_prop(node):
+        for prop in self._xp_prop(node):  # type: ignore[union-attr]
             for p, v in self._extract_property(
                 prop, items_seen=items_seen, base_url=base_url, itemids=itemids
             ):