Skip to content

Commit

Permalink
Merge pull request #107 from DerwenAI/update
Browse files Browse the repository at this point in the history
clean up and optimizations
  • Loading branch information
ceteri authored Mar 10, 2021
2 parents 1b76ddc + 3ce7b63 commit 8707c25
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 92 deletions.
17 changes: 9 additions & 8 deletions changelog.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
# PyTextRank changelog

## 3.0.2
## 3.1.0

2021-03-??

* updated links on PyPI
* `pylint` coverage for code checking
* linking definitions and citations in source code apidocs to our online docs
* rename `master` branch to `main`
* add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
* refactor the stopwords feature as a constructor argument
* add `get_unit_vector()` method to expose the characteristic *unit vector*
* add `calc_sent_dist()` method to expose the sentence distance measures (for summarization)
* unit test for summarization
* add contributor instructions
* add a factory class that assigns each doc its own Textrank object; kudos @Ankush-Chander
* rename `master` branch to `main`
* include a unit test for summarization
* updated contributor instructions
* `pylint` coverage for code checking
* linking definitions and citations in source code apidocs to our online docs
* updated links on PyPI


## 3.0.1
Expand Down
1 change: 1 addition & 0 deletions pytextrank/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def _create_component_tr (
scrubber = scrubber,
)


@Language.factory("positionrank", default_config=_DEFAULT_CONFIG)
def _create_component_pr (
nlp: Language, # pylint: disable=W0613
Expand Down
111 changes: 35 additions & 76 deletions pytextrank/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# -*- coding: utf-8 -*-

"""
Implements the base class for `TextRank`, with placeholder methods
to be used by subclasses for algorithm extensions.
Implements the base class for `TextRank`
with placeholder methods to be used by subclasses for algorithm extensions.
"""

from .util import groupby_apply, default_scrubber
Expand Down Expand Up @@ -104,9 +104,10 @@ class VectorElem:
phrase_id: int
coord: float


class BaseTextRankFactory:
"""
A factory class that provides the documnt with its instance of
A factory class that provides the document with its instance of
`BaseTextRank`
"""

Expand All @@ -124,21 +125,19 @@ def __init__ (
scrubber: typing.Optional[typing.Callable] = None,
) -> None:
"""
Constructor for a `Factory` object
Constructor for a factory used to instantiate the PyTextRank pipeline components.
edge_weight:
default weight for an edge
pos_kept:
parts of speech tags to be kept; adjust this if strings representing
the POS tags change
parts of speech tags to be kept; adjust this if strings representing the POS tags change
token_lookback:
the window for neighboring tokens (similar to a skip gram)
the window for neighboring tokens, similar to a *skip gram*
scrubber:
optional "scrubber" function to clean up punctuation from a token;
if `None` then defaults to `pytextrank.default_scrubber`
optional "scrubber" function to clean up punctuation from a token; if `None` then defaults to `pytextrank.default_scrubber`
"""
self.edge_weight: float = edge_weight
self.token_lookback: int = token_lookback
Expand All @@ -153,17 +152,8 @@ def __init__ (
else:
self.scrubber = default_scrubber

self.doc: Doc = None
self.stopwords: dict = defaultdict(list)

# effectively, performs the same work as the `reset()` method;
# called explicitly here for the sake of type annotations
self.elapsed_time: float = 0.0
self.lemma_graph: nx.DiGraph = nx.DiGraph()
self.phrases: dict = defaultdict(list)
self.ranks: typing.Dict[Lemma, float] = {}
self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()


def __call__ (
self,
Expand All @@ -173,11 +163,11 @@ def __call__ (
Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
document to create a *pipeline component* for `TextRank` as
a stateful component, invoked when the document gets processed.
See: <https://spacy.io/usage/processing-pipelines#pipelines>
doc:
a document container for accessing the annotations produced by earlier
stages of the `spaCy` pipeline
a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
"""
Doc.set_extension("textrank", force=True, default=None)
Doc.set_extension("phrases", force=True, default=[])
Expand All @@ -194,72 +184,47 @@ def __call__ (
return doc


def reset (
self
) -> None:
"""
Reinitialize the data structures needed for extracting phrases,
removing any pre-existing state.
"""
self.elapsed_time = 0.0
self.lemma_graph = nx.DiGraph()
self.phrases = defaultdict(list)
self.ranks = {}
self.seen_lemma = OrderedDict()


class BaseTextRank:
"""
Implements the *TextRank* algorithm defined by
[[mihalcea04textrank]](https://derwen.ai/docs/ptr/biblio/#mihalcea04textrank),
deployed as a `spaCy` pipeline component.
"""

_EDGE_WEIGHT: float = 1.0
_POS_KEPT: typing.List[str] = ["ADJ", "NOUN", "PROPN", "VERB"]
_TOKEN_LOOKBACK: int = 3
This class does not get called directly; instantiate its factory
instead.
"""

def __init__ (
self,
doc,
*,
edge_weight: float = _EDGE_WEIGHT,
pos_kept: typing.List[str] = None,
token_lookback: int = _TOKEN_LOOKBACK,
scrubber: typing.Optional[typing.Callable] = None,
doc: Doc,
edge_weight: float,
pos_kept: typing.List[str],
token_lookback: int,
scrubber: typing.Callable,
) -> None:
"""
Constructor for a `TextRank` object
Constructor for a `TextRank` object.
doc:
a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
edge_weight:
default weight for an edge
pos_kept:
parts of speech tags to be kept; adjust this if strings representing
the POS tags change
parts of speech tags to be kept; adjust this if strings representing the POS tags change
token_lookback:
the window for neighboring tokens (similar to a skip gram)
the window for neighboring tokens, similar to a *skip gram*
scrubber:
optional "scrubber" function to clean up punctuation from a token;
if `None` then defaults to `pytextrank.default_scrubber`
optional "scrubber" function to clean up punctuation from a token
"""
self.doc: Doc = doc
self.edge_weight: float = edge_weight
self.token_lookback: int = token_lookback

if pos_kept:
self.pos_kept: typing.List[str] = pos_kept
else:
self.pos_kept = self._POS_KEPT

if scrubber:
self.scrubber: typing.Callable = scrubber
else:
self.scrubber = default_scrubber

self.doc: Doc = doc
self.pos_kept: typing.List[str] = pos_kept
self.scrubber: typing.Callable = scrubber
self.stopwords: dict = defaultdict(list)

# effectively, performs the same work as the `reset()` method;
Expand All @@ -270,6 +235,7 @@ def __init__ (
self.ranks: typing.Dict[Lemma, float] = {}
self.seen_lemma: typing.Dict[Lemma, typing.Set[int]] = OrderedDict()


def reset (
self
) -> None:
Expand Down Expand Up @@ -300,13 +266,10 @@ def load_stopwords (
and bias/distort the results.
data:
dictionary of `lemma: [pos]` items to define the stop words, where
each item has a key as a lemmatized token and a value as a list of POS
tags
dictionary of `lemma: [pos]` items to define the stop words, where each key is a lemmatized token and each value is a list of POS tags
path:
optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html)
of a JSON file – in lieu of providing a `data` parameter
optional [`pathlib.Path`](https://docs.python.org/3/library/pathlib.html) of a JSON file – in lieu of providing a `data` parameter
"""
if data:
self.stopwords = data
Expand All @@ -332,9 +295,8 @@ def calc_textrank (
returns:
list of ranked phrases, in descending order
"""
self.reset()
t0 = time.time()

self.reset()
self.lemma_graph = self._construct_graph()

# to run the algorithm, we use the NetworkX implementation
Expand Down Expand Up @@ -416,8 +378,7 @@ def _keep_token (
a parsed `spaCy` [`Token`](https://spacy.io/api/token) to be evaluated
returns:
boolean value for whether to keep this token as a node in the lemma
graph
boolean value for whether to keep this token as a node in the lemma graph
"""
lemma = token.lemma_.lower().strip()

Expand Down Expand Up @@ -505,8 +466,7 @@ def _collect_phrases (
rank metrics corresponding to each node
returns:
phrases extracted from the lemma graph, each with an aggregate rank
metric
phrases extracted from the lemma graph, each with an aggregate rank metric
"""
phrases: typing.Dict[Span, float] = {
span: sum(
Expand Down Expand Up @@ -704,8 +664,7 @@ def summary (
total number of sentences to yield for the extractive summarization
preserve_order:
flag to preserve the order of sentences as they originally occurred in
the source text; defaults to `False`
flag to preserve the order of sentences as they originally occurred in the source text; defaults to `False`
yields:
texts for sentences, in order
Expand Down
19 changes: 11 additions & 8 deletions pytextrank/positionrank.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

class PositionRankFactory (BaseTextRankFactory):
"""
A factory class that provides the documnt with its instance of
A factory class that provides the document with its instance of
`PositionRank`
"""

Expand All @@ -22,23 +22,23 @@ def __call__ (
) -> Doc:
"""
Set the extension attributes on a `spaCy` [`Doc`](https://spacy.io/api/doc)
document to create a *pipeline component* for `PositionRank` as a
stateful component, invoked when the document gets processed.
document to create a *pipeline component* for `PositionRank` as
a stateful component, invoked when the document gets processed.
See: <https://spacy.io/usage/processing-pipelines#pipelines>
doc:
a document container for accessing the annotations produced by earlier stages of the `spaCy` pipeline
a document container, providing the annotations produced by earlier stages of the `spaCy` pipeline
"""
Doc.set_extension("textrank", force=True, default=None)
Doc.set_extension("phrases", force=True, default=[])

doc._.textrank = PositionRank(
doc,
edge_weight=self.edge_weight,
pos_kept=self.pos_kept,
token_lookback=self.token_lookback,
scrubber=self.scrubber,
edge_weight = self.edge_weight,
pos_kept = self.pos_kept,
token_lookback = self.token_lookback,
scrubber = self.scrubber,
)

doc._.phrases = doc._.textrank.calc_textrank()
Expand All @@ -50,6 +50,9 @@ class PositionRank (BaseTextRank):
Implements the *PositionRank* algorithm described by
[[florescuc17]](https://derwen.ai/docs/ptr/biblio/#florescuc17),
deployed as a `spaCy` pipeline component.
This class does not get called directly; instantiate its factory
instead.
"""

def get_personalization (
Expand Down
4 changes: 4 additions & 0 deletions sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@

# now add `"word": ["NOUN"]` to the stop words, to remove instances
# of `"word"` or `"words"` then see how the ranked phrases differ...

# TODO: refactor stopwords as a constructor argument
#nlp.add_pipe("textrank")

doc = nlp(text)
tr = doc._.textrank

Expand Down

0 comments on commit 8707c25

Please sign in to comment.