From 3ce7b63d6ada554030851d3b60b3a358dcbeeab1 Mon Sep 17 00:00:00 2001
From: Paco Nathan
Date: Wed, 10 Mar 2021 07:09:19 -0800
Subject: [PATCH] clean up and optimizations

---
 changelog.txt              |  17 +++---
 pytextrank/__init__.py     |   1 +
 pytextrank/base.py         | 111 ++++++++++++-------------
 pytextrank/positionrank.py |  19 ++++---
 sample.py                  |   4 ++
 5 files changed, 60 insertions(+), 92 deletions(-)

diff --git a/changelog.txt b/changelog.txt
index 074845f..d977a70 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,18 +1,19 @@
 # PyTextRank changelog

-## 3.0.2
+## 3.1.0

 2021-03-??

- * updated links on PyPi
- * `pylint` coverage for code checking
- * linking definitions and citations in source code apidocs to our online docs
+ * rename `master` branch to `main`
+ * add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
+ * refactor the stopwords feature as a constructor argument
  * add `get_unit_vector()` method to expose the characteristic *unit vector*
  * add `calc_sent_dist()` method to expose the sentence distance measures (for summarization)
- * unit test for summarization
- * add contributor instructions
- * add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
- * rename `master` branch to `main`
+ * include a unit test for summarization
+ * updated contributor instructions
+ * `pylint` coverage for code checking
+ * linking definitions and citations in source code apidocs to our online docs
+ * updated links on PyPi


 ## 3.0.1

diff --git a/pytextrank/__init__.py b/pytextrank/__init__.py
index de912ee..1a948bf 100644
--- a/pytextrank/__init__.py
+++ b/pytextrank/__init__.py
@@ -41,6 +41,7 @@ def _create_component_tr (
         scrubber = scrubber,
     )

+
 @Language.factory("positionrank", default_config=_DEFAULT_CONFIG)
 def _create_component_pr (
     nlp: Language,  # pylint: disable=W0613
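
Note: the `@Language.factory` registrations above are what let a spaCy 3.x pipeline pull in these components by name. A minimal usage sketch, assuming the `en_core_web_sm` model (any parsed pipeline works); importing `pytextrank` runs the factory declarations, after which the component can be added by its string name:

    import spacy
    import pytextrank  # importing registers the "textrank" and "positionrank" factories

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")  # or "positionrank"

    doc = nlp("Compatibility of systems of linear constraints over the set of natural numbers.")

    # the factory assigns this doc its own TextRank instance
    for phrase in doc._.phrases:
        print(phrase.text, phrase.rank)
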
diff --git a/pytextrank/base.py b/pytextrank/base.py
index b1e5310..31270ea 100644
--- a/pytextrank/base.py
+++ b/pytextrank/base.py
@@ -2,8 +2,8 @@
 # -*- coding: utf-8 -*-

 """
-Implements the base class for `TextRank`, with placeholder methods
-to be used by subclasses for algorithm extensions.
+Implements the base class for `TextRank` –
+with placeholder methods to be used by subclasses for algorithm extensions.
 """

 from .util import groupby_apply, default_scrubber
@@ -104,9 +104,10 @@ class VectorElem:
     phrase_id: int
     coord: float

+
 class BaseTextRankFactory:
     """
-A factory class that provides the documnt with its instance of
+A factory class that provides the document with its instance of
 `BaseTextRank`
     """

@@ -124,21 +125,19 @@ def __init__ (
         scrubber: typing.Optional[typing.Callable] = None,
         ) -> None:
         """
-Constructor for a `Factory` object
+Constructor for a factory used to instantiate the PyTextRank pipeline components.

     edge_weight:
 default weight for an edge

     pos_kept:
-parts of speech tags to be kept; adjust this if strings representing
-the POS tags change
+parts of speech tags to be kept; adjust this if strings representing the POS tags change

     token_lookback:
-the window for neighboring tokens (similar to a skip gram)
+the window for neighboring tokens – similar to a *skip gram*

     scrubber:
-optional "scrubber" function to clean up punctuation from a token;
-if `None` then defaults to `pytextrank.default_scrubber`
+optional "scrubber" function to clean up punctuation from a token; if `None` then defaults to `pytextrank.default_scrubber`
         """
         self.edge_weight: float = edge_weight
         self.token_lookback: int = token_lookback
@@ -153,17 +152,8 @@ def __init__ (
         else:
             self.scrubber = default_scrubber

-        self.doc: Doc = None
         self.stopwords: dict = defaultdict(list)

-        # effectively, performs the same work as the `reset()` method;
-        # called explicitly here for the sake of type annotations
-        self.elapsed_time: float = 0.0
-        self.lemma_graph: nx.DiGraph = nx.DiGraph()
-        self.phrases: dict = defaultdict(list)
-        self.ranks: typing.Dict[Lemma, float] = {}
-        self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()
-

     def __call__ (
         self,
@@ -173,11 +163,11 @@
 Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
 document to create a *pipeline component* for `TextRank` as a
 stateful component, invoked when the document gets processed.
+
 See:

     doc:
-a document container for accessing the annotations produced by earlier
-stages of the `spaCy` pipeline
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
         """
         Doc.set_extension("textrank", force=True, default=None)
         Doc.set_extension("phrases", force=True, default=[])
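
Note: with the per-doc state moved out of the factory, a custom scrubber is configured once on the factory and shared by every doc it processes. A sketch against the constructor shown above; the `strip_articles` helper is hypothetical, and it assumes the scrubber takes and returns a `str`, matching `pytextrank.default_scrubber`:

    import spacy
    from pytextrank.base import BaseTextRankFactory

    def strip_articles (text: str) -> str:
        # hypothetical scrubber: drop a leading article before phrases get aggregated
        for article in ("a ", "an ", "the "):
            if text.lower().startswith(article):
                return text[len(article):]
        return text

    nlp = spacy.load("en_core_web_sm")
    tr_factory = BaseTextRankFactory(scrubber=strip_articles)

    # calling the factory on a parsed Doc attaches doc._.textrank and doc._.phrases
    doc = tr_factory(nlp("The natural numbers form a set of strict inequations."))
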
@@ -194,72 +184,47 @@

         return doc

-    def reset (
-        self
-        ) -> None:
-        """
-Reinitialize the data structures needed for extracting phrases,
-removing any pre-existing state.
-        """
-        self.elapsed_time = 0.0
-        self.lemma_graph = nx.DiGraph()
-        self.phrases = defaultdict(list)
-        self.ranks = {}
-        self.seen_lemma = OrderedDict()
-
-
 class BaseTextRank:
     """
 Implements the *TextRank* algorithm defined by
 [[mihalcea04textrank]](https://derwen.ai/docs/ptr/biblio/#mihalcea04textrank),
 deployed as a `spaCy` pipeline component.
-    """
-
-    _EDGE_WEIGHT: float = 1.0
-    _POS_KEPT: typing.List[str] = ["ADJ", "NOUN", "PROPN", "VERB"]
-    _TOKEN_LOOKBACK: int = 3
+This class does not get called directly; instantiate its factory
+instead.
+    """

     def __init__ (
         self,
-        doc,
-        *,
-        edge_weight: float = _EDGE_WEIGHT,
-        pos_kept: typing.List[str] = None,
-        token_lookback: int = _TOKEN_LOOKBACK,
-        scrubber: typing.Optional[typing.Callable] = None,
+        doc: Doc,
+        edge_weight: float,
+        pos_kept: typing.List[str],
+        token_lookback: int,
+        scrubber: typing.Callable,
         ) -> None:
         """
-Constructor for a `TextRank` object
+Constructor for a `TextRank` object.
+
+    doc:
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline

     edge_weight:
 default weight for an edge

     pos_kept:
-parts of speech tags to be kept; adjust this if strings representing
-the POS tags change
+parts of speech tags to be kept; adjust this if strings representing the POS tags change

     token_lookback:
-the window for neighboring tokens (similar to a skip gram)
+the window for neighboring tokens – similar to a *skip gram*

     scrubber:
-optional "scrubber" function to clean up punctuation from a token;
-if `None` then defaults to `pytextrank.default_scrubber`
+optional "scrubber" function to clean up punctuation from a token
         """
+        self.doc: Doc = doc
         self.edge_weight: float = edge_weight
         self.token_lookback: int = token_lookback
-
-        if pos_kept:
-            self.pos_kept: typing.List[str] = pos_kept
-        else:
-            self.pos_kept = self._POS_KEPT
-
-        if scrubber:
-            self.scrubber: typing.Callable = scrubber
-        else:
-            self.scrubber = default_scrubber
-
-        self.doc: Doc = doc
+        self.pos_kept: typing.List[str] = pos_kept
+        self.scrubber: typing.Callable = scrubber

         self.stopwords: dict = defaultdict(list)

         # effectively, performs the same work as the `reset()` method;
@@ -270,6 +235,7 @@ def __init__ (
         self.ranks: typing.Dict[Lemma, float] = {}
         self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()
+
     def reset (
         self
         ) -> None:
         """
@@ -300,13 +266,10 @@ def load_stopwords (
 and bias/distort the results.

     data:
-dictionary of `lemma: [pos]` items to define the stop words, where
-each item has a key as a lemmatized token and a value as a list of POS
-tags
+dictionary of `lemma: [pos]` items to define the stop words, where each item has a key as a lemmatized token and a value as a list of POS tags

     path:
-optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html)
-of a JSON file – in lieu of providing a `data` parameter
+optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html) of a JSON file – in lieu of providing a `data` parameter
         """
         if data:
             self.stopwords = data
@@ -332,9 +295,8 @@
     returns:
 list of ranked phrases, in descending order
         """
-        self.reset()
         t0 = time.time()
-
+        self.reset()
         self.lemma_graph = self._construct_graph()

         # to run the algorithm, we use the NetworkX implementation
@@ -416,8 +378,7 @@
 a parsed `spaCy` [`Token`](https://spacy.io/api/token) to be evaluated

     returns:
-boolean value for whether to keep this token as a node in the lemma
-graph
+boolean value for whether to keep this token as a node in the lemma graph
         """
         lemma = token.lemma_.lower().strip()

@@ -505,8 +466,7 @@
 rank metrics corresponding to each node

     returns:
-phrases extracted from the lemma graph, each with an aggregate rank
-metric
+phrases extracted from the lemma graph, each with an aggregate rank metric
         """
         phrases: typing.Dict[Span, float] = {
             span: sum(
@@ -704,8 +664,7 @@
 total number of sentences to yield for the extractive summarization

     preserve_order:
-flag to preserve the order of sentences as they originally occurred in
-the source text; defaults to `False`
+flag to preserve the order of sentences as they originally occurred in the source text; defaults to `False`

     yields:
 texts for sentences, in order
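
Note: the `summary()` cleanup above rounds out the extractive summarization path built on `get_unit_vector()` and `calc_sent_dist()`. A sketch of how it gets exercised; only `preserve_order` appears in this hunk, while the `limit_phrases` and `limit_sents` keyword names are assumed from the released 3.x API:

    import spacy
    import pytextrank

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")

    text = (
        "PyTextRank builds a lemma graph from the parsed tokens. "
        "PageRank then scores the nodes of that graph. "
        "The top-ranked phrases define a unit vector for the document. "
        "Sentences get ranked by their distance from that unit vector."
    )
    doc = nlp(text)

    # yields the sentences closest to the characteristic unit vector
    for sent in doc._.textrank.summary(limit_phrases=10, limit_sents=2, preserve_order=True):
        print(sent)
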
diff --git a/pytextrank/positionrank.py b/pytextrank/positionrank.py
index 59b4b8e..aa7873a 100644
--- a/pytextrank/positionrank.py
+++ b/pytextrank/positionrank.py
@@ -11,7 +11,7 @@

 class PositionRankFactory (BaseTextRankFactory):
     """
-A factory class that provides the documnt with its instance of
+A factory class that provides the document with its instance of
 `PositionRank`
     """

@@ -22,23 +22,23 @@ def __call__ (
     ) -> Doc:
         """
 Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
-document to create a *pipeline component* for `PositionRank` as a
-stateful component, invoked when the document gets processed.
+document to create a *pipeline component* for `PositionRank` as
+a stateful component, invoked when the document gets processed.
 See:

     doc:
-a document container for accessing the annotations produced by earlier stages of the `spaCy` pipeline
+a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
         """
         Doc.set_extension("textrank", force=True, default=None)
         Doc.set_extension("phrases", force=True, default=[])

         doc._.textrank = PositionRank(
             doc,
-            edge_weight=self.edge_weight,
-            pos_kept=self.pos_kept,
-            token_lookback=self.token_lookback,
-            scrubber=self.scrubber,
+            edge_weight = self.edge_weight,
+            pos_kept = self.pos_kept,
+            token_lookback = self.token_lookback,
+            scrubber = self.scrubber,
         )

         doc._.phrases = doc._.textrank.calc_textrank()

@@ -50,6 +50,9 @@ class PositionRank (BaseTextRank):
 Implements the *PositionRank* algorithm described by
 [[florescuc17]](https://derwen.ai/docs/ptr/biblio/#florescuc17),
 deployed as a `spaCy` pipeline component.
+
+This class does not get called directly; instantiate its factory
+instead.
     """

     def get_personalization (

diff --git a/sample.py b/sample.py
index 39900cf..aed7234 100755
--- a/sample.py
+++ b/sample.py
@@ -64,6 +64,10 @@
 # now add `"word": ["NOUN"]` to the stop words, to remove instances
 # of `"word"` or `"words"` then see how the ranked phrases differ...

+
+# TODO: refactor stopwords as a constructor argument
+#nlp.add_pipe("textrank")
+
 doc = nlp(text)
 tr = doc._.textrank
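
Note: pending the constructor-argument refactor flagged in the TODO, the comment above describes the current flow: load custom stop words on the doc's TextRank instance, then recompute. A sketch (the example text is arbitrary); `calc_textrank()` must run again because `doc._.phrases` was already populated when the pipeline first processed the text:

    import spacy
    import pytextrank

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")
    doc = nlp("The right words in the right order can change the ranking of words.")

    tr = doc._.textrank
    tr.load_stopwords(data={"word": ["NOUN"]})

    # recompute the ranked phrases, now with "word"/"words" filtered out
    for phrase in tr.calc_textrank():
        print(phrase.text, phrase.rank)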