From 14ad0ad8c129777637b4d9b7244f78c068966a7f Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Mon, 21 Nov 2022 21:18:41 -0500 Subject: [PATCH 01/11] add mypy typing --- .pre-commit-config.yaml | 5 ++++ extruct/_extruct.py | 37 +++++++++++++++------------- extruct/dublincore.py | 3 ++- extruct/jsonld.py | 5 ++-- extruct/microformat.py | 3 ++- extruct/opengraph.py | 3 ++- extruct/rdfa.py | 5 ++-- extruct/tool.py | 23 +++++++++++------- extruct/uniform.py | 4 +++- extruct/utils.py | 1 + extruct/w3cmicrodata.py | 13 ++++++---- extruct/xmldom.py | 53 +++++++++++++++++++++-------------------- mypy.ini | 49 +++++++++++++++++++++++++++++++++++++ setup.py | 2 -- 14 files changed, 139 insertions(+), 67 deletions(-) create mode 100644 mypy.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8c4a5775..87aeae69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,3 +9,8 @@ repos: language_version: python3 repo: https://github.com/PyCQA/isort rev: 5.10.1 + - hooks: + - id: mypy + additional_dependencies: [types-requests, types-mock, types-six, lxml-stubs] + repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.982 diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 6ebae1cd..353a2500 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -1,4 +1,5 @@ import logging +from typing import Any, Callable import warnings from extruct.dublincore import DublinCoreExtractor @@ -15,17 +16,17 @@ def extract( - htmlstring, - base_url=None, - encoding="UTF-8", - syntaxes=SYNTAXES, - errors="strict", - uniform=False, - return_html_node=False, - schema_context="http://schema.org", - with_og_array=False, - **kwargs -): + htmlstring: str | bytes, + base_url: str | None = None, + encoding: str = "UTF-8", + syntaxes: list[str] = SYNTAXES, + errors: str = "strict", + uniform: bool = False, + return_html_node: bool = False, + schema_context: str = "http://schema.org", + with_og_array: bool = False, + **kwargs: None +) -> dict[str, list[dict[str, Any]]]: """ htmlstring: string with valid html document; base_url: base url of the html document @@ -112,7 +113,7 @@ def extract( tree, ) ) - output = {} + output: dict[str, list[dict[str, Any]]] = {} for syntax, extract, document in processors: try: output[syntax] = list(extract(document, base_url=base_url)) @@ -124,7 +125,9 @@ def extract( if errors == "strict": raise if uniform: - uniform_processors = [] + uniform_processors: list[ + tuple[str, Callable[..., Any], list[Any], str | None] + ] = [] if "microdata" in syntaxes: uniform_processors.append( ( @@ -162,14 +165,14 @@ def extract( ) ) - for syntax, uniform, raw, schema_context in uniform_processors: + for syntax, uniform_fn, raw, schema_ctx in uniform_processors: try: if syntax == "opengraph": - output[syntax] = uniform(raw, with_og_array=with_og_array) + output[syntax] = uniform_fn(raw, with_og_array=with_og_array) elif syntax == "dublincore": - output[syntax] = uniform(raw) + output[syntax] = uniform_fn(raw) else: - output[syntax] = uniform(raw, schema_context) + output[syntax] = uniform_fn(raw, schema_ctx) except Exception as e: if errors == "ignore": output[syntax] = [] diff --git a/extruct/dublincore.py b/extruct/dublincore.py index 35d6254f..f8f12abb 100644 --- a/extruct/dublincore.py +++ b/extruct/dublincore.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False import re from w3lib.html import strip_html5_whitespace @@ -110,7 +111,7 @@ def get_lower_attrib(name): return re.sub(r".*\.", "", name).lower() -class DublinCoreExtractor(object): +class DublinCoreExtractor: """DublinCore extractor following extruct API.""" def extract(self, htmlstring, base_url=None, encoding="UTF-8"): diff --git a/extruct/jsonld.py b/extruct/jsonld.py index 250f639b..495ad07b 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- """ JSON-LD extractor @@ -14,7 +15,7 @@ HTML_OR_JS_COMMENTLINE = re.compile(r"^\s*(//.*|)") -class JsonLdExtractor(object): +class JsonLdExtractor: _xp_jsonld = lxml.etree.XPath( 'descendant-or-self::script[@type="application/ld+json"]' ) @@ -26,7 +27,7 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"): def extract_items(self, document, base_url=None): return [ item - for items in map(self._extract_items, self._xp_jsonld(document)) + for items in map(self._extract_items, self._xp_jsonld(document)) # type: ignore[arg-type] if items for item in items if item diff --git a/extruct/microformat.py b/extruct/microformat.py index 84c15473..dab6a114 100644 --- a/extruct/microformat.py +++ b/extruct/microformat.py @@ -1,7 +1,8 @@ +# mypy: disallow_untyped_defs=False import mf2py -class MicroformatExtractor(object): +class MicroformatExtractor: def extract(self, htmlstring, base_url=None, encoding="UTF-8"): return list(self.extract_items(htmlstring, base_url=base_url)) diff --git a/extruct/opengraph.py b/extruct/opengraph.py index 1cddc948..ea53ca1c 100644 --- a/extruct/opengraph.py +++ b/extruct/opengraph.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False import re from extruct.utils import parse_html @@ -15,7 +16,7 @@ } -class OpenGraphExtractor(object): +class OpenGraphExtractor: """OpenGraph extractor following extruct API.""" def extract(self, htmlstring, base_url=None, encoding="UTF-8"): diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 575fac97..273b0b57 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- """ RDFa extractor @@ -16,7 +17,7 @@ from pyRdfa import pyRdfa as PyRdfa from pyRdfa.initialcontext import initial_context from rdflib import Graph -from rdflib import logger as rdflib_logger +from rdflib import logger as rdflib_logger # type: ignore[no-redef] from extruct.utils import parse_xmldom_html @@ -37,7 +38,7 @@ ) -class RDFaExtractor(object): +class RDFaExtractor: def _replaceNS(self, prop, html_element, head_element): """Expand namespace to match with returned json (e.g.: og -> 'http://ogp.me/ns#')""" diff --git a/extruct/tool.py b/extruct/tool.py index 6101edb7..04e3ed6f 100644 --- a/extruct/tool.py +++ b/extruct/tool.py @@ -1,5 +1,7 @@ +from __future__ import annotations import argparse import json +from typing import Any import requests @@ -8,14 +10,17 @@ def metadata_from_url( - url, - syntaxes=SYNTAXES, - uniform=False, - schema_context="http://schema.org", - errors="strict", -): + url: str, + syntaxes: list[str] = SYNTAXES, + uniform: bool = False, + schema_context: str = "http://schema.org", + errors: str = "strict", +) -> dict[str, Any]: resp = requests.get(url, timeout=30) - result = {"url": url, "status": "{} {}".format(resp.status_code, resp.reason)} + result: dict[str, Any] = { + "url": url, + "status": "{} {}".format(resp.status_code, resp.reason), + } try: resp.raise_for_status() except requests.exceptions.HTTPError: @@ -33,7 +38,7 @@ def metadata_from_url( return result -def main(args=None): +def main(args: Any | None = None) -> Any: parser = argparse.ArgumentParser(prog="extruct", description=__doc__) arg = parser.add_argument arg("url", help="The target URL") @@ -51,7 +56,7 @@ def main(args=None): default=False, help="""If True uniform output format of all syntaxes to a list of dicts. Returned dicts structure: - {'@context': 'http://example.com', + {'@context': 'http://example.com', '@type': 'example_type', /* All other the properties in keys here */ }""", diff --git a/extruct/uniform.py b/extruct/uniform.py index 538d494f..8c96c4ed 100644 --- a/extruct/uniform.py +++ b/extruct/uniform.py @@ -1,4 +1,6 @@ +# mypy: disallow_untyped_defs=False import copy +from typing import Any from six.moves.urllib.parse import urljoin, urlparse @@ -10,7 +12,7 @@ def _uopengraph(extracted, with_og_array=False): for obj in extracted: # In order of appearance in the page properties = list(obj["properties"]) - flattened = {} + flattened: dict[Any, Any] = {} for k, v in properties: if k not in flattened.keys(): diff --git a/extruct/utils.py b/extruct/utils.py index 0baaf946..b16f0f0e 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import lxml.html diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index 4e38d467..dba198f3 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False """ HTML Microdata parser @@ -9,8 +10,10 @@ """ +from __future__ import annotations import collections from functools import partial +from typing import Any, Set try: from urlparse import urljoin @@ -43,7 +46,7 @@ ) -class LxmlMicrodataExtractor(object): +class LxmlMicrodataExtractor: # iterate in document order (used below for fast get_docid) _xp_item = lxml.etree.XPath("descendant-or-self::*[@itemscope]") _xp_prop = lxml.etree.XPath( @@ -70,14 +73,14 @@ def extract(self, htmlstring, base_url=None, encoding="UTF-8"): def extract_items(self, document, base_url): itemids = self._build_itemids(document) - items_seen = set() + items_seen: Set[Any] = set() return [ item for item in ( self._extract_item( it, items_seen=items_seen, base_url=base_url, itemids=itemids ) - for it in self._xp_item(document) + for it in self._xp_item(document) # type: ignore[union-attr] ) if item ] @@ -88,7 +91,7 @@ def get_docid(self, node, itemids): def _build_itemids(self, document): """Build itemids for a fast get_docid implementation. Use document order.""" root = document.getroottree().getroot() - return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))} + return {node: idx + 1 for idx, node in enumerate(self._xp_item(root))} # type: ignore[arg-type] def _extract_item(self, node, items_seen, base_url, itemids): itemid = self.get_docid(node, itemids) @@ -160,7 +163,7 @@ def _extract_item(self, node, items_seen, base_url, itemids): return item def _extract_properties(self, node, items_seen, base_url, itemids): - for prop in self._xp_prop(node): + for prop in self._xp_prop(node): # type: ignore[union-attr] for p, v in self._extract_property( prop, items_seen=items_seen, base_url=base_url, itemids=itemids ): diff --git a/extruct/xmldom.py b/extruct/xmldom.py index 25920aad..8d477a58 100644 --- a/extruct/xmldom.py +++ b/extruct/xmldom.py @@ -1,3 +1,6 @@ +# mypy: disallow_untyped_defs=False +from __future__ import annotations + # -*- coding: utf-8 -*- from copy import copy, deepcopy from xml.dom import Node @@ -13,7 +16,7 @@ from lxml.html import HtmlElementClassLookup, HTMLParser -class DomElementUnicodeResult(object): +class DomElementUnicodeResult: CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE ELEMENT_NODE = Node.ELEMENT_NODE TEXT_NODE = Node.TEXT_NODE @@ -30,7 +33,7 @@ def data(self): raise RuntimeError -class DomTextNode(object): +class DomTextNode: CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE ELEMENT_NODE = Node.ELEMENT_NODE TEXT_NODE = Node.TEXT_NODE @@ -53,7 +56,7 @@ def lxmlDomNodeType(node): return Node.NOTATION_NODE -class DomHtmlMixin(object): +class DomHtmlMixin: CDATA_SECTION_NODE = Node.CDATA_SECTION_NODE ELEMENT_NODE = Node.ELEMENT_NODE TEXT_NODE = Node.TEXT_NODE @@ -62,7 +65,7 @@ class DomHtmlMixin(object): @property def documentElement(self): - return self.getroottree().getroot() + return self.getroottree().getroot() # type: ignore[attr-defined] @property def nodeType(self): @@ -71,24 +74,24 @@ def nodeType(self): @property def nodeName(self): # FIXME: this is a simpification - return self.tag + return self.tag # type: ignore[attr-defined] @property def tagName(self): - return self.tag + return self.tag # type: ignore[attr-defined] @property def localName(self): - return self.xpath("local-name(.)") + return self.xpath("local-name(.)") # type: ignore[attr-defined] def hasAttribute(self, name): - return name in self.attrib + return name in self.attri # type: ignore[attr-defined] def getAttribute(self, name): - return self.get(name) + return self.get(name) # type: ignore[attr-defined] def setAttribute(self, name, value): - self.set(name, value) + self.set(name, value) # type: ignore[attr-defined] def cloneNode(self, deep): return deepcopy(self) if deep else copy(self) @@ -96,7 +99,7 @@ def cloneNode(self, deep): @property def attributes(self): attrs = {} - for name, value in self.attrib.items(): + for name, value in self.attrib.items(): # type: ignore[attr-defined] a = Attr(name) a.value = value attrs[name] = a @@ -104,11 +107,11 @@ def attributes(self): @property def parentNode(self): - return self.getparent() + return self.getparent() # type: ignore[attr-defined] @property def childNodes_xpath(self): - for n in self._xp_childrennodes(self): + for n in self._xp_childrennodes(self): # type: ignore[union-attr,arg-type] if isinstance(n, ElementBase): yield n @@ -118,24 +121,24 @@ def childNodes_xpath(self): if isinstance(n, _ElementUnicodeResult): n = DomElementUnicodeResult(n) else: - n.nodeType = Node.TEXT_NODE - n.data = n + n.nodeType = Node.TEXT_NODE # type: ignore[attr-defined] + n.data = n # type: ignore[attr-defined] yield n @property def childNodes(self): - if self.text: - yield DomTextNode(self.text) - for n in self.iterchildren(): + if self.text: # type: ignore[attr-defined] + yield DomTextNode(self.text) # type: ignore[attr-defined] + for n in self.iterchildren(): # type: ignore[attr-defined] yield n if n.tail: yield DomTextNode(n.tail) def getElementsByTagName(self, name): - return self.iterdescendants(name) + return self.iterdescendants(name) # type: ignore[attr-defined] def getElementById(self, i): - return self.get_element_by_id(i) + return self.get_element_by_id(i) # type: ignore[attr-defined] @property def data(self): @@ -145,21 +148,19 @@ def data(self): raise RuntimeError def toxml(self, encoding=None): - return tostring(self, encoding=encoding if encoding is not None else "unicode") + return tostring(self, encoding=encoding if encoding is not None else "unicode") # type: ignore[call-overload] class DomHtmlElementClassLookup(HtmlElementClassLookup): def __init__(self): - super(DomHtmlElementClassLookup, self).__init__() + super().__init__() self._lookups = {} def lookup(self, node_type, document, namespace, name): k = (node_type, document, namespace, name) t = self._lookups.get(k) if t is None: - cur = super(DomHtmlElementClassLookup, self).lookup( - node_type, document, namespace, name - ) + cur = super().lookup(node_type, document, namespace, name) newtype = type("Dom" + cur.__name__, (cur, DomHtmlMixin), {}) self._lookups[k] = newtype return newtype @@ -173,6 +174,6 @@ class XmlDomHTMLParser(HTMLParser): """ def __init__(self, **kwargs): - super(HTMLParser, self).__init__(**kwargs) + super().__init__(**kwargs) parser_lookup = DomHtmlElementClassLookup() self.set_element_class_lookup(parser_lookup) diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 00000000..a414d4d6 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,49 @@ +[mypy] +show_column_numbers=True +show_error_codes = True + +# nice to allow using types for docs even when they are Any +disallow_any_unimported=False +disallow_any_expr=False +disallow_any_decorated=False +disallow_any_explicit=False +disallow_any_generics=True +disallow_subclassing_any=False + +disallow_untyped_calls=False +disallow_untyped_defs=True +disallow_incomplete_defs=True +check_untyped_defs=True +# @pytest.mark.asyncio is untyped +disallow_untyped_decorators=False + +no_implicit_optional=True +strict_optional=True + +warn_redundant_casts=True +warn_unused_ignores=True +warn_no_return=True +warn_return_any=True +warn_unreachable=False + +strict_equality=True + +ignore_missing_imports=False + +[mypy-mf2py.*] +ignore_missing_imports = True + +[mypy-pyRdfa.*] +ignore_missing_imports = True + +[mypy-rdflib.*] +ignore_missing_imports = True + +[mypy-jstyleson.*] +ignore_missing_imports = True + +[mypy-urlparse.*] +ignore_missing_imports = True + +[mypy-html_text] +ignore_missing_imports = True diff --git a/setup.py b/setup.py index c979896a..fd617066 100644 --- a/setup.py +++ b/setup.py @@ -16,8 +16,6 @@ def get_version(): return f.read().strip() -requirements = [] - setup( name="extruct", version=get_version(), From da6ac8ff12e9efb7c7dac69d17281115169dfa4e Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Thu, 24 Nov 2022 17:24:48 -0500 Subject: [PATCH 02/11] fix format --- extruct/_extruct.py | 2 +- extruct/tool.py | 1 + extruct/w3cmicrodata.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 353a2500..9987a15d 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -1,6 +1,6 @@ import logging -from typing import Any, Callable import warnings +from typing import Any, Callable from extruct.dublincore import DublinCoreExtractor from extruct.jsonld import JsonLdExtractor diff --git a/extruct/tool.py b/extruct/tool.py index 04e3ed6f..feaf213b 100644 --- a/extruct/tool.py +++ b/extruct/tool.py @@ -1,4 +1,5 @@ from __future__ import annotations + import argparse import json from typing import Any diff --git a/extruct/w3cmicrodata.py b/extruct/w3cmicrodata.py index dba198f3..567e1b76 100644 --- a/extruct/w3cmicrodata.py +++ b/extruct/w3cmicrodata.py @@ -11,6 +11,7 @@ """ from __future__ import annotations + import collections from functools import partial from typing import Any, Set From 817c8f42b4f26764b709e47068e84493d6ab8557 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 25 Nov 2022 15:29:30 -0500 Subject: [PATCH 03/11] fix --- extruct/_extruct.py | 8 +++----- setup.py | 1 + tests/__init__.py | 1 + tests/test_dublincore.py | 1 + tests/test_extruct.py | 8 ++++---- tests/test_extruct_uniform.py | 1 + tests/test_jsonld.py | 1 + tests/test_microdata.py | 1 + tests/test_microformat.py | 1 + tests/test_opengraph.py | 1 + tests/test_rdfa.py | 17 +++++++++-------- tests/test_tool.py | 11 ++++++----- tests/test_uniform.py | 1 + 13 files changed, 31 insertions(+), 22 deletions(-) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 9987a15d..ad67e884 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -25,7 +25,7 @@ def extract( return_html_node: bool = False, schema_context: str = "http://schema.org", with_og_array: bool = False, - **kwargs: None + url: str | None = None, # deprecated ) -> dict[str, list[dict[str, Any]]]: """ htmlstring: string with valid html document; @@ -45,15 +45,13 @@ def extract( The feature is supported only by microdata syntax. Each node is of `lxml.etree.Element` type. schema_context: schema's context for current page""" - if base_url is None and "url" in kwargs: + if base_url is None and url is not None: warnings.warn( '"url" argument is deprecated, please use "base_url"', DeprecationWarning, stacklevel=2, ) - base_url = kwargs.pop("url") - if kwargs: - raise TypeError("Unexpected keyword arguments") + base_url = url if not (isinstance(syntaxes, list) and all(v in SYNTAXES for v in syntaxes)): raise ValueError( "syntaxes must be a list with any or all (default) of" diff --git a/setup.py b/setup.py index fd617066..5a7e6636 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import os diff --git a/tests/__init__.py b/tests/__init__.py index 95eb53d9..0311ec9b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import os diff --git a/tests/test_dublincore.py b/tests/test_dublincore.py index 1f7684c5..52746e6d 100644 --- a/tests/test_dublincore.py +++ b/tests/test_dublincore.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_extruct.py b/tests/test_extruct.py index 24e0179e..b2ed643f 100644 --- a/tests/test_extruct.py +++ b/tests/test_extruct.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest @@ -66,7 +67,7 @@ def test_deprecated_url(self): def test_extra_kwargs(self): body, expected = self._microdata_custom_url("product_custom_url.json") with self.assertRaises(TypeError): - extruct.extract(body, foo="bar") + extruct.extract(body, foo="bar") # type: ignore[call-arg] def _microdata_custom_url(self, test_file): body = get_testdata("schema.org", "product.html") @@ -85,10 +86,9 @@ def test_errors(self): data = extruct.extract(body) # ignore exceptions - expected = {} data = extruct.extract(body, errors="ignore") - assert data == expected + assert data == {} # ignore exceptions data = extruct.extract(body, errors="log") - assert data == expected + assert data == {} diff --git a/tests/test_extruct_uniform.py b/tests/test_extruct_uniform.py index 9695ed7c..d97f31d7 100644 --- a/tests/test_extruct_uniform.py +++ b/tests/test_extruct_uniform.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_jsonld.py b/tests/test_jsonld.py index 95080990..b2d126cd 100644 --- a/tests/test_jsonld.py +++ b/tests/test_jsonld.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_microdata.py b/tests/test_microdata.py index e830d259..d3741e2c 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_microformat.py b/tests/test_microformat.py index d1932092..8543d87c 100644 --- a/tests/test_microformat.py +++ b/tests/test_microformat.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_opengraph.py b/tests/test_opengraph.py index 611c687d..cfc18b5c 100644 --- a/tests/test_opengraph.py +++ b/tests/test_opengraph.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest diff --git a/tests/test_rdfa.py b/tests/test_rdfa.py index 3ad96efa..6c081838 100644 --- a/tests/test_rdfa.py +++ b/tests/test_rdfa.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False # -*- coding: utf-8 -*- import json import unittest @@ -26,11 +27,12 @@ class TestRDFa(unittest.TestCase): maxDiff = None def assertJsonLDEqual(self, a, b, normalize_bnode_ids=True): - json_kwargs = dict( - indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True + sa = json.dumps( + a, indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True + ) + sb = json.dumps( + b, indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True ) - sa = json.dumps(a, **json_kwargs) - sb = json.dumps(b, **json_kwargs) if normalize_bnode_ids: sa = self.normalize_bnode_ids(sa) sb = self.normalize_bnode_ids(sb) @@ -45,10 +47,9 @@ def normalize_bnode_ids(self, jsld): return jsld def prettify(self, a, normalize_bnode_ids=True): - json_kwargs = dict( - indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True + output = json.dumps( + a, indent=2, separators=(",", ": "), sort_keys=True, ensure_ascii=True ) - output = json.dumps(a, **json_kwargs) if normalize_bnode_ids: output = self.normalize_bnode_ids(output) return output @@ -95,7 +96,7 @@ def test_w3c_rdfaprimer(self): def mocked_fix_order(x, y, z): raise Exception() - rdfae._fix_order = mocked_fix_order + rdfae._fix_order = mocked_fix_order # type: ignore[assignment] data = rdfae.extract(body, base_url="http://www.example.com/index.html") self.assertJsonLDEqual(data, expected) diff --git a/tests/test_tool.py b/tests/test_tool.py index 33d35bca..5f237910 100644 --- a/tests/test_tool.py +++ b/tests/test_tool.py @@ -1,10 +1,11 @@ +# mypy: disallow_untyped_defs=False import json import unittest try: import unittest.mock as mock except ImportError: - import mock + import mock # type: ignore[no-redef] from requests.exceptions import HTTPError @@ -149,12 +150,12 @@ def test_main_all(self, mock_get): @mock.patch("extruct.tool.requests.get") def test_main_single_syntax(self, mock_get): - expected = { + data = { "opengraph": self.expected["opengraph"], "url": self.url, "status": "200 OK", } - expected = json.dumps(expected, indent=2, sort_keys=True) + expected = json.dumps(data, indent=2, sort_keys=True) mock_response = build_mock_response( url=self.url, content=get_testdata("songkick", "tovestyrke.html"), @@ -166,13 +167,13 @@ def test_main_single_syntax(self, mock_get): @mock.patch("extruct.tool.requests.get") def test_main_multiple_syntaxes(self, mock_get): - expected = { + data = { "opengraph": self.expected["opengraph"], "microdata": self.expected["microdata"], "url": self.url, "status": "200 OK", } - expected = json.dumps(expected, indent=2, sort_keys=True) + expected = json.dumps(data, indent=2, sort_keys=True) mock_response = build_mock_response( url=self.url, content=get_testdata("songkick", "tovestyrke.html"), diff --git a/tests/test_uniform.py b/tests/test_uniform.py index cf94d52e..ca27dd2c 100644 --- a/tests/test_uniform.py +++ b/tests/test_uniform.py @@ -1,3 +1,4 @@ +# mypy: disallow_untyped_defs=False import unittest import extruct From 558dce346250c8c2ae6a651e18a4b7452838799f Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 25 Nov 2022 16:40:38 -0500 Subject: [PATCH 04/11] fix types --- extruct/_extruct.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index ad67e884..551caf28 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -1,3 +1,4 @@ +from __future__ import annotations import logging import warnings from typing import Any, Callable From 4f39695d75b337e804c47929bd6c8e4f0b79e8d4 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 25 Nov 2022 16:44:09 -0500 Subject: [PATCH 05/11] fmt --- extruct/_extruct.py | 1 + 1 file changed, 1 insertion(+) diff --git a/extruct/_extruct.py b/extruct/_extruct.py index 551caf28..cbe2a1a8 100644 --- a/extruct/_extruct.py +++ b/extruct/_extruct.py @@ -1,4 +1,5 @@ from __future__ import annotations + import logging import warnings from typing import Any, Callable From 6f2c933cef3b1026b9e7f07f062e0089b88b1744 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 25 Nov 2022 16:47:46 -0500 Subject: [PATCH 06/11] fix --- extruct/xmldom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/xmldom.py b/extruct/xmldom.py index 8d477a58..fe0a03ba 100644 --- a/extruct/xmldom.py +++ b/extruct/xmldom.py @@ -85,7 +85,7 @@ def localName(self): return self.xpath("local-name(.)") # type: ignore[attr-defined] def hasAttribute(self, name): - return name in self.attri # type: ignore[attr-defined] + return name in self.attrib # type: ignore[attr-defined] def getAttribute(self, name): return self.get(name) # type: ignore[attr-defined] From 6d91961d89744cccd7c0cb4953bc8221d470fd39 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Tue, 29 Nov 2022 21:00:32 -0500 Subject: [PATCH 07/11] cut py36 support --- tox.ini | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tox.ini b/tox.ini index 96c5607a..1da7f1cf 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py36, py37, py38, py39 +envlist = py37, py38, py39 [testenv] @@ -8,13 +8,6 @@ deps = commands = py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} -[testenv:py36] -deps = - setuptools<58 # https://stackoverflow.com/a/69100830/217088 -commands = - pip install -r requirements-dev.txt - py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} - [testenv:py38] commands = py.test --cov-report=term --cov-report= --cov=extruct {posargs:extruct tests} From e4b0530b19f188984ba5c6219decdbb76d454d5c Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Tue, 29 Nov 2022 21:02:43 -0500 Subject: [PATCH 08/11] cut py36 support --- .github/workflows/python-package.yml | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index eff718d5..a623f953 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.6', '3.7', '3.8', '3.9'] + python-version: ['3.7', '3.8', '3.9'] steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index 5a7e6636..ebc425fa 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,6 @@ def get_version(): "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", From 0065b4d33f068b60e4fe361dc17bec24dac98ad0 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Tue, 13 Dec 2022 23:03:19 -0500 Subject: [PATCH 09/11] mypy: move config to pyproject --- extruct/rdfa.py | 2 +- mypy.ini | 49 ------------------------------------------------- pyproject.toml | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 50 deletions(-) delete mode 100644 mypy.ini create mode 100644 pyproject.toml diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 273b0b57..74203bab 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -17,7 +17,7 @@ from pyRdfa import pyRdfa as PyRdfa from pyRdfa.initialcontext import initial_context from rdflib import Graph -from rdflib import logger as rdflib_logger # type: ignore[no-redef] +from rdflib import logger as rdflib_logger from extruct.utils import parse_xmldom_html diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index a414d4d6..00000000 --- a/mypy.ini +++ /dev/null @@ -1,49 +0,0 @@ -[mypy] -show_column_numbers=True -show_error_codes = True - -# nice to allow using types for docs even when they are Any -disallow_any_unimported=False -disallow_any_expr=False -disallow_any_decorated=False -disallow_any_explicit=False -disallow_any_generics=True -disallow_subclassing_any=False - -disallow_untyped_calls=False -disallow_untyped_defs=True -disallow_incomplete_defs=True -check_untyped_defs=True -# @pytest.mark.asyncio is untyped -disallow_untyped_decorators=False - -no_implicit_optional=True -strict_optional=True - -warn_redundant_casts=True -warn_unused_ignores=True -warn_no_return=True -warn_return_any=True -warn_unreachable=False - -strict_equality=True - -ignore_missing_imports=False - -[mypy-mf2py.*] -ignore_missing_imports = True - -[mypy-pyRdfa.*] -ignore_missing_imports = True - -[mypy-rdflib.*] -ignore_missing_imports = True - -[mypy-jstyleson.*] -ignore_missing_imports = True - -[mypy-urlparse.*] -ignore_missing_imports = True - -[mypy-html_text] -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..49ff00dc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[tool.mypy] +show_column_numbers = true +show_error_codes = true + +disallow_any_unimported=false +disallow_any_expr=false +disallow_any_decorated=false +disallow_any_explicit=false +disallow_any_generics=true +disallow_subclassing_any=false + +disallow_untyped_calls=false +disallow_untyped_defs=true +disallow_incomplete_defs=true +check_untyped_defs=true +disallow_untyped_decorators=false + +no_implicit_optional=true +strict_optional=true + +warn_redundant_casts=true +warn_unused_ignores=true +warn_no_return=true +warn_return_any=true +warn_unreachable=false + +strict_equality=true + +ignore_missing_imports=false + +[[tool.mypy.overrides]] +module = [ + 'mf2py.*', + 'pyRdfa.*', + 'rdflib.*', + 'jstyleson.*', + 'urlparse.*', + 'html_text.*', +] +ignore_missing_imports=true From 0c1d8d9e06243edcde80238d79e18a717c2cce61 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Tue, 13 Dec 2022 23:05:51 -0500 Subject: [PATCH 10/11] fix --- extruct/rdfa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/rdfa.py b/extruct/rdfa.py index 74203bab..273b0b57 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -17,7 +17,7 @@ from pyRdfa import pyRdfa as PyRdfa from pyRdfa.initialcontext import initial_context from rdflib import Graph -from rdflib import logger as rdflib_logger +from rdflib import logger as rdflib_logger # type: ignore[no-redef] from extruct.utils import parse_xmldom_html From 945a7e64a528dadc555ef29da16b7c9c33685a23 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Fri, 16 Dec 2022 20:10:51 -0500 Subject: [PATCH 11/11] support 3.10 and 3.11 --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a623f953..850a22ba 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9'] + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2