From 3ce7b63d6ada554030851d3b60b3a358dcbeeab1 Mon Sep 17 00:00:00 2001
From: Paco Nathan
Date: Wed, 10 Mar 2021 07:09:19 -0800
Subject: [PATCH] clean up and optimizations

---
 changelog.txt              |  17 +++---
 pytextrank/__init__.py     |   1 +
 pytextrank/base.py         | 111 ++++++++++++-------------
 pytextrank/positionrank.py |  19 ++++---
 sample.py                  |   4 ++
 5 files changed, 60 insertions(+), 92 deletions(-)

diff --git a/changelog.txt b/changelog.txt
index 074845f..d977a70 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,18 +1,19 @@
 # PyTextRank changelog

-## 3.0.2
+## 3.1.0

 2021-03-??

- * updated links on PyPi
- * `pylint` coverage for code checking
- * linking definitions and citations in source code apidocs to our online docs
+ * rename `master` branch to `main`
+ * add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
+ * refactor the stopwords feature as a constructor argument
  * add `get_unit_vector()` method to expose the characteristic *unit vector*
  * add `calc_sent_dist()` method to expose the sentence distance measures (for summarization)
- * unit test for summarization
- * add contributor instructions
- * add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
- * rename `master` branch to `main`
+ * include a unit test for summarization
+ * updated contributor instructions
+ * `pylint` coverage for code checking
+ * linking definitions and citations in source code apidocs to our online docs
+ * updated links on PyPi


 ## 3.0.1

diff --git a/pytextrank/__init__.py b/pytextrank/__init__.py
index de912ee..1a948bf 100644
--- a/pytextrank/__init__.py
+++ b/pytextrank/__init__.py
@@ -41,6 +41,7 @@ def _create_component_tr (
         scrubber = scrubber,
     )

+
 @Language.factory("positionrank", default_config=_DEFAULT_CONFIG)
 def _create_component_pr (
     nlp: Language,  # pylint: disable=W0613
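
Note: the `@Language.factory` registrations above are what let a spaCy 3.x pipeline pull in these components by name. A minimal usage sketch, assuming the `en_core_web_sm` model (any parsed pipeline works); importing `pytextrank` runs the factory declarations, after which the component can be added by its string name:

    import spacy
    import pytextrank  # importing registers the "textrank" and "positionrank" factories

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")  # or "positionrank"

    doc = nlp("Compatibility of systems of linear constraints over the set of natural numbers.")

    # the factory assigns this doc its own TextRank instance
    for phrase in doc._.phrases:
        print(phrase.text, phrase.rank)
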
diff --git a/pytextrank/base.py b/pytextrank/base.py
index b1e5310..31270ea 100644
--- a/pytextrank/base.py
+++ b/pytextrank/base.py
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-

 """
-Implements the base class for `TextRank`, with placeholder methods
-to be used by subclasses for algorithm extensions.
+Implements the base class for `TextRank` –
+with placeholder methods to be used by subclasses for algorithm extensions.
 """

 from .util import groupby_apply, default_scrubber
@@ -104,9 +104,10 @@ class VectorElem:
     phrase_id: int
     coord: float

+
 class BaseTextRankFactory:
     """
-A factory class that provides the documnt with its instance of
+A factory class that provides the document with its instance of
 `BaseTextRank`
     """

@@ -124,21 +125,19 @@ def __init__ (
         scrubber: typing.Optional[typing.Callable] = None,
         ) -> None:
         """
-Constructor for a `Factory` object
+Constructor for a factory used to instantiate the PyTextRank pipeline components.

     edge_weight:
 default weight for an edge

     pos_kept:
-parts of speech tags to be kept; adjust this if strings representing
-the POS tags change
+parts of speech tags to be kept; adjust this if strings representing the POS tags change

     token_lookback:
-the window for neighboring tokens (similar to a skip gram)
+the window for neighboring tokens – similar to a *skip gram*

     scrubber:
-optional "scrubber" function to clean up punctuation from a token;
-if `None` then defaults to `pytextrank.default_scrubber`
+optional "scrubber" function to clean up punctuation from a token; if `None` then defaults to `pytextrank.default_scrubber`
         """
         self.edge_weight: float = edge_weight
         self.token_lookback: int = token_lookback
@@ -153,17 +152,8 @@ def __init__ (
         else:
             self.scrubber = default_scrubber

-        self.doc: Doc = None
         self.stopwords: dict = defaultdict(list)

-        # effectively, performs the same work as the `reset()` method;
-        # called explicitly here for the sake of type annotations
-        self.elapsed_time: float = 0.0
-        self.lemma_graph: nx.DiGraph = nx.DiGraph()
-        self.phrases: dict = defaultdict(list)
-        self.ranks: typing.Dict[Lemma, float] = {}
-        self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()
-

     def __call__ (
         self,
@@ -173,11 +163,11 @@
 Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
 document to create a *pipeline component* for `TextRank` as a
 stateful component, invoked when the document gets processed.
+
 See:

     doc:
-a document container for accessing the annotations produced by earlier
-stages of the `spaCy` pipeline
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
         """
         Doc.set_extension("textrank", force=True, default=None)
         Doc.set_extension("phrases", force=True, default=[])
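
Note: with the per-doc state moved out of the factory, a custom scrubber is configured once on the factory and shared by every doc it processes. A sketch against the constructor shown above; the `strip_articles` helper is hypothetical, and it assumes the scrubber takes and returns a `str`, matching `pytextrank.default_scrubber`:

    import spacy
    from pytextrank.base import BaseTextRankFactory

    def strip_articles (text: str) -> str:
        # hypothetical scrubber: drop a leading article before phrases get aggregated
        for article in ("a ", "an ", "the "):
            if text.lower().startswith(article):
                return text[len(article):]
        return text

    nlp = spacy.load("en_core_web_sm")
    tr_factory = BaseTextRankFactory(scrubber=strip_articles)

    # calling the factory on a parsed Doc attaches doc._.textrank and doc._.phrases
    doc = tr_factory(nlp("The natural numbers form a set of strict inequations."))
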
@@ -194,72 +184,47 @@

         return doc

-    def reset (
-        self
-        ) -> None:
-        """
-Reinitialize the data structures needed for extracting phrases,
-removing any pre-existing state.
-        """
-        self.elapsed_time = 0.0
-        self.lemma_graph = nx.DiGraph()
-        self.phrases = defaultdict(list)
-        self.ranks = {}
-        self.seen_lemma = OrderedDict()
-
-
 class BaseTextRank:
     """
 Implements the *TextRank* algorithm defined by
 [[mihalcea04textrank]](https://derwen.ai/docs/ptr/biblio/#mihalcea04textrank),
 deployed as a `spaCy` pipeline component.
-    """
-
-    _EDGE_WEIGHT: float = 1.0
-    _POS_KEPT: typing.List[str] = ["ADJ", "NOUN", "PROPN", "VERB"]
-    _TOKEN_LOOKBACK: int = 3
+This class does not get called directly; instantiate its factory
+instead.
+    """

     def __init__ (
         self,
-        doc,
-        *,
-        edge_weight: float = _EDGE_WEIGHT,
-        pos_kept: typing.List[str] = None,
-        token_lookback: int = _TOKEN_LOOKBACK,
-        scrubber: typing.Optional[typing.Callable] = None,
+        doc: Doc,
+        edge_weight: float,
+        pos_kept: typing.List[str],
+        token_lookback: int,
+        scrubber: typing.Callable,
         ) -> None:
         """
-Constructor for a `TextRank` object
+Constructor for a `TextRank` object.
+
+    doc:
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline

     edge_weight:
 default weight for an edge

     pos_kept:
-parts of speech tags to be kept; adjust this if strings representing
-the POS tags change
+parts of speech tags to be kept; adjust this if strings representing the POS tags change

     token_lookback:
-the window for neighboring tokens (similar to a skip gram)
+the window for neighboring tokens – similar to a *skip gram*

     scrubber:
-optional "scrubber" function to clean up punctuation from a token;
-if `None` then defaults to `pytextrank.default_scrubber`
+optional "scrubber" function to clean up punctuation from a token
         """
+        self.doc: Doc = doc
         self.edge_weight: float = edge_weight
         self.token_lookback: int = token_lookback
-
-        if pos_kept:
-            self.pos_kept: typing.List[str] = pos_kept
-        else:
-            self.pos_kept = self._POS_KEPT
-
-        if scrubber:
-            self.scrubber: typing.Callable = scrubber
-        else:
-            self.scrubber = default_scrubber
-
-        self.doc: Doc = doc
+        self.pos_kept: typing.List[str] = pos_kept
+        self.scrubber: typing.Callable = scrubber

         self.stopwords: dict = defaultdict(list)

         # effectively, performs the same work as the `reset()` method;
@@ -270,6 +235,7 @@ def __init__ (
         self.ranks: typing.Dict[Lemma, float] = {}
         self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()
+
     def reset (
         self
         ) -> None:
         """
@@ -300,13 +266,10 @@ def load_stopwords (
 and bias/distort the results.

     data:
-dictionary of `lemma: [pos]` items to define the stop words, where
-each item has a key as a lemmatized token and a value as a list of POS
-tags
+dictionary of `lemma: [pos]` items to define the stop words, where each item has a key as a lemmatized token and a value as a list of POS tags

     path:
-optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html)
-of a JSON file – in lieu of providing a `data` parameter
+optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html) of a JSON file – in lieu of providing a `data` parameter
         """
         if data:
             self.stopwords = data
@@ -332,9 +295,8 @@
     returns:
 list of ranked phrases, in descending order
         """
-        self.reset()
         t0 = time.time()
-
+        self.reset()
         self.lemma_graph = self._construct_graph()

         # to run the algorithm, we use the NetworkX implementation
@@ -416,8 +378,7 @@
 a parsed `spaCy` [`Token`](https://spacy.io/api/token) to be evaluated

     returns:
-boolean value for whether to keep this token as a node in the lemma
-graph
+boolean value for whether to keep this token as a node in the lemma graph
         """
         lemma = token.lemma_.lower().strip()

@@ -505,8 +466,7 @@
 rank metrics corresponding to each node

     returns:
-phrases extracted from the lemma graph, each with an aggregate rank
-metric
+phrases extracted from the lemma graph, each with an aggregate rank metric
         """
         phrases: typing.Dict[Span, float] = {
             span: sum(
@@ -704,8 +664,7 @@
 total number of sentences to yield for the extractive summarization

     preserve_order:
-flag to preserve the order of sentences as they originally occurred in
-the source text; defaults to `False`
+flag to preserve the order of sentences as they originally occurred in the source text; defaults to `False`

     yields:
 texts for sentences, in order
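
Note: the `summary()` cleanup above rounds out the extractive summarization path built on `get_unit_vector()` and `calc_sent_dist()`. A sketch of how it gets exercised; only `preserve_order` appears in this hunk, while the `limit_phrases` and `limit_sents` keyword names are assumed from the released 3.x API:

    import spacy
    import pytextrank

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")

    text = (
        "PyTextRank builds a lemma graph from the parsed tokens. "
        "PageRank then scores the nodes of that graph. "
        "The top-ranked phrases define a unit vector for the document. "
        "Sentences get ranked by their distance from that unit vector."
    )
    doc = nlp(text)

    # yields the sentences closest to the characteristic unit vector
    for sent in doc._.textrank.summary(limit_phrases=10, limit_sents=2, preserve_order=True):
        print(sent)
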
diff --git a/pytextrank/positionrank.py b/pytextrank/positionrank.py
index 59b4b8e..aa7873a 100644
--- a/pytextrank/positionrank.py
+++ b/pytextrank/positionrank.py
@@ -11,7 +11,7 @@

 class PositionRankFactory (BaseTextRankFactory):
     """
-A factory class that provides the documnt with its instance of
+A factory class that provides the document with its instance of
 `PositionRank`
     """

@@ -22,23 +22,23 @@ def __call__ (
     ) -> Doc:
         """
 Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
-document to create a *pipeline component* for `PositionRank` as a
-stateful component, invoked when the document gets processed.
+document to create a *pipeline component* for `PositionRank` as
+a stateful component, invoked when the document gets processed.
 See:

     doc:
-a document container for accessing the annotations produced by earlier stages of the `spaCy` pipeline
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
         """
         Doc.set_extension("textrank", force=True, default=None)
         Doc.set_extension("phrases", force=True, default=[])

         doc._.textrank = PositionRank(
             doc,
-            edge_weight=self.edge_weight,
-            pos_kept=self.pos_kept,
-            token_lookback=self.token_lookback,
-            scrubber=self.scrubber,
+            edge_weight = self.edge_weight,
+            pos_kept = self.pos_kept,
+            token_lookback = self.token_lookback,
+            scrubber = self.scrubber,
         )

         doc._.phrases = doc._.textrank.calc_textrank()

@@ -50,6 +50,9 @@ class PositionRank (BaseTextRank):
 Implements the *PositionRank* algorithm described by
 [[florescuc17]](https://derwen.ai/docs/ptr/biblio/#florescuc17),
 deployed as a `spaCy` pipeline component.
+
+This class does not get called directly; instantiate its factory
+instead.
     """

     def get_personalization (

diff --git a/sample.py b/sample.py
index 39900cf..aed7234 100755
--- a/sample.py
+++ b/sample.py
@@ -64,6 +64,10 @@
 # now add `"word": ["NOUN"]` to the stop words, to remove instances
 # of `"word"` or `"words"` then see how the ranked phrases differ...

+
+# TODO: refactor stopwords as a constructor argument
+#nlp.add_pipe("textrank")
+
 doc = nlp(text)
 tr = doc._.textrank
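
Note: pending the constructor-argument refactor flagged in the TODO, the comment above describes the current flow: load custom stop words on the doc's TextRank instance, then recompute. A sketch (the example text is arbitrary); `calc_textrank()` must run again because `doc._.phrases` was already populated when the pipeline first processed the text:

    import spacy
    import pytextrank

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")
    doc = nlp("The right words in the right order can change the ranking of words.")

    tr = doc._.textrank
    tr.load_stopwords(data={"word": ["NOUN"]})

    # recompute the ranked phrases, now with "word"/"words" filtered out
    for phrase in tr.calc_textrank():
        print(phrase.text, phrase.rank)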