From eb885d41643881c927ced194d4dcea4968c1ffc9 Mon Sep 17 00:00:00 2001 From: Oliver Borchers Date: Sun, 10 Apr 2022 21:22:47 +0200 Subject: [PATCH] Gensim Support for 4.0.0 (#69) * Added dockerfile for tests * Added test files * Updated for v2 * Fixed average tests * Removed test embeddings * Fixed base s2v * Fixed base s2v arguments * Added docs * Updated readme * updated readme --- .dockerignore | 4 + .gitignore | 3 +- Dockerfile | 21 + README.md | 69 +- docs/fse/index.html | 161 ++ docs/fse/inputs.html | 1256 +++++++++++ docs/fse/models/average.html | 715 ++++++ docs/fse/models/average_inner.html | 95 + docs/fse/models/base_s2v.html | 2636 ++++++++++++++++++++++ docs/fse/models/index.html | 106 + docs/fse/models/sentencevectors.html | 1649 ++++++++++++++ docs/fse/models/sif.html | 438 ++++ docs/fse/models/usif.html | 489 ++++ docs/fse/models/utils.html | 495 +++++ docs/fse/vectors.html | 421 ++++ fse/inputs.py | 4 +- fse/models/average.py | 31 +- fse/models/average_inner.c | 659 +++--- fse/models/average_inner.pyx | 27 +- fse/models/base_s2v.py | 62 +- fse/models/sentencevectors.py | 8 +- fse/models/sif.py | 18 +- fse/models/usif.py | 18 +- fse/vectors.py | 5 +- notebooks/STS-Benchmarks.ipynb | 3066 +++++++++++++++++++++----- notebooks/Speed Comparision.ipynb | 12 +- notebooks/Tutorial.ipynb | 310 +-- release.sh | 4 +- setup.py | 6 +- test/test_average.py | 24 +- test/test_base_s2v.py | 74 +- test/test_sentencevectors.py | 45 +- test/test_sif.py | 13 +- test/test_usif.py | 12 +- tests.sh | 21 + 35 files changed, 11700 insertions(+), 1277 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 docs/fse/index.html create mode 100644 docs/fse/inputs.html create mode 100644 docs/fse/models/average.html create mode 100644 docs/fse/models/average_inner.html create mode 100644 docs/fse/models/base_s2v.html create mode 100644 docs/fse/models/index.html create mode 100644 docs/fse/models/sentencevectors.html create mode 100644 docs/fse/models/sif.html create mode 100644 docs/fse/models/usif.html create mode 100644 docs/fse/models/utils.html create mode 100644 docs/fse/vectors.html create mode 100644 tests.sh diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..535cf1a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +build +dist +*.c +*.so \ No newline at end of file diff --git a/.gitignore b/.gitignore index 37a54f4..b572b9f 100644 --- a/.gitignore +++ b/.gitignore @@ -71,7 +71,8 @@ fse*.egg-info *.old *.model *_out.txt -*.html vectors *.vectors *.joblib + +test_emb* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e2158d9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.6.0 + +RUN pip install -U pip + +RUN pip install scipy \ + smart_open \ + scikit-learn \ + wordfreq \ + huggingface-hub \ + psutil + +ARG gensim==4.0.0 +RUN pip install -U "gensim==$gensim" pytest coverage + +ADD . /home +WORKDIR /home +RUN rm -rf build dist + +RUN pip install -e . 
+ +CMD [ "pytest", "-vv" ] \ No newline at end of file diff --git a/README.md b/README.md index 4965d3a..bb55b3c 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ gensim.models.keyedvectors.BaseKeyedVectors class, for example *Word2Vec* or *Fa ``` from gensim.models import FastText sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] -ft = FastText(sentences, min_count=1, size=10) +ft = FastText(sentences, min_count=1, vector_size=10) from fse import Average, IndexedList model = Average(ft) @@ -196,38 +196,38 @@ Results Model | Vectors | params | [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) :---: | :---: | :---: | :---: `CBOW` | `paranmt-300` | | 79.82 -`uSIF` | `paranmt-300` | length=11 | 79.02 -`SIF-10` | `paranmt-300` | components=10 | 76.76 -`SIF-10` | `paragram-300-sl999` | components=10 | 74.27 -`SIF-10` | `paragram-300-ws353` | components=10 | 74.08 -`SIF-10` | `fasttext-crawl-subwords-300` | components=10 | 73.54 -`uSIF` | `paragram-300-sl999` | length=11 | 73.09 -`SIF-10` | `fasttext-wiki-news-subwords-300` | components=10 | 72.24 -`uSIF` | `paragram-300-ws353` | length=11 | 71.90 -`SIF-10` | `glove-twitter-200` | components=10 | 71.67 -`SIF-10` | `glove-wiki-gigaword-300` | components=10 | 71.43 -`SIF-10` | `word2vec-google-news-300` | components=10 | 71.17 -`SIF-10` | `glove-wiki-gigaword-200` | components=10 | 70.73 -`SIF-10` | `glove-twitter-100` | components=10 | 69.70 -`uSIF` | `fasttext-crawl-subwords-300` | length=11 | 69.55 -`uSIF` | `fasttext-wiki-news-subwords-300` | length=11 | 69.05 -`SIF-10` | `glove-wiki-gigaword-100` | components=10 | 68.43 -`uSIF` | `glove-wiki-gigaword-300` | length=11 | 67.73 -`uSIF` | `glove-wiki-gigaword-200` | length=11 | 67.26 -`uSIF` | `word2vec-google-news-300` | length=11 | 67.15 -`uSIF` | `glove-twitter-200` | length=11 | 66.73 -`SIF-10` | `glove-twitter-50` | components=10 | 65.57 -`uSIF` | `glove-wiki-gigaword-100` | length=11 | 65.48 -`uSIF` | `paragram-25` | length=11 | 64.31 -`uSIF` | `glove-twitter-100` | length=11 | 64.22 -`SIF-10` | `glove-wiki-gigaword-50` | components=10 | 64.20 -`uSIF` | `glove-wiki-gigaword-50` | length=11 | 62.22 +`uSIF` | `paranmt-300` | length=11 | 79.00 +`SIF-10` | `paranmt-300` | components=10 | 76.72 +`SIF-10` | `paragram-300-sl999` | components=10 | 74.21 +`SIF-10` | `paragram-300-ws353` | components=10 | 74.03 +`SIF-10` | `fasttext-crawl-subwords-300` | components=10 | 73.38 +`uSIF` | `paragram-300-sl999` | length=11 | 73.04 +`SIF-10` | `fasttext-wiki-news-subwords-300` | components=10 | 72.29 +`uSIF` | `paragram-300-ws353` | length=11 | 71.84 +`SIF-10` | `glove-twitter-200` | components=10 | 71.62 +`SIF-10` | `glove-wiki-gigaword-300` | components=10 | 71.35 +`SIF-10` | `word2vec-google-news-300` | components=10 | 71.12 +`SIF-10` | `glove-wiki-gigaword-200` | components=10 | 70.62 +`SIF-10` | `glove-twitter-100` | components=10 | 69.65 +`uSIF` | `fasttext-crawl-subwords-300` | length=11 | 69.40 +`uSIF` | `fasttext-wiki-news-subwords-300` | length=11 | 68.63 +`SIF-10` | `glove-wiki-gigaword-100` | components=10 | 68.34 +`uSIF` | `glove-wiki-gigaword-300` | length=11 | 67.60 +`uSIF` | `glove-wiki-gigaword-200` | length=11 | 67.11 +`uSIF` | `word2vec-google-news-300` | length=11 | 66.99 +`uSIF` | `glove-twitter-200` | length=11 | 66.67 +`SIF-10` | `glove-twitter-50` | components=10 | 65.52 +`uSIF` | `glove-wiki-gigaword-100` | length=11 | 65.33 +`uSIF` | `paragram-25` | length=11 | 64.22 +`uSIF` | `glove-twitter-100` | length=11 | 64.13 +`SIF-10` | 
`glove-wiki-gigaword-50` | components=10 | 64.11 +`uSIF` | `glove-wiki-gigaword-50` | length=11 | 62.06 `CBOW` | `word2vec-google-news-300` | | 61.54 -`uSIF` | `glove-twitter-50` | length=11 | 60.50 -`SIF-10` | `paragram-25` | components=10 | 59.22 -`uSIF` | `glove-twitter-25` | length=11 | 55.17 +`uSIF` | `glove-twitter-50` | length=11 | 60.41 +`SIF-10` | `paragram-25` | components=10 | 59.07 +`uSIF` | `glove-twitter-25` | length=11 | 55.06 `CBOW` | `paragram-300-ws353` | | 54.72 -`SIF-10` | `glove-twitter-25` | components=10 | 54.42 +`SIF-10` | `glove-twitter-25` | components=10 | 54.16 `CBOW` | `paragram-300-sl999` | | 51.46 `CBOW` | `fasttext-crawl-subwords-300` | | 48.49 `CBOW` | `glove-wiki-gigaword-300` | | 44.46 @@ -244,6 +244,11 @@ Model | Vectors | params | [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index. Changelog ------------- +1.0.0: +- Added support for gensim>=4. This library is no longer compatible with gensim<4. For migration, see the [README](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4). +- `size` argument is now `vector_size` +- Added docs + 0.2.0: - Added `Vectors` and `FTVectors` class and hub support by `from_pretrained` - Extended benchmark @@ -291,7 +296,7 @@ I am looking for active contributors to keep this package alive. Please feel fre Author: Oliver Borchers -Copyright (C) 2021 Oliver Borchers +Copyright (C) 2022 Oliver Borchers Citation ------------- diff --git a/docs/fse/index.html b/docs/fse/index.html new file mode 100644 index 0000000..dbd6450 --- /dev/null +++ b/docs/fse/index.html @@ -0,0 +1,161 @@ + + + + + + +fse API documentation + + + + + + + + + + + +
+
+
+

Package fse

+
+
+
+ +Expand source code + +
import logging
+
+from fse import models
+from fse.models import SIF, Average, SentenceVectors, uSIF
+from fse.vectors import FTVectors, Vectors
+
+from .inputs import (
+    BaseIndexedList,
+    CIndexedList,
+    CSplitCIndexedList,
+    CSplitIndexedList,
+    IndexedLineDocument,
+    IndexedList,
+    SplitCIndexedList,
+    SplitIndexedList,
+)
+
+
+class NullHandler(logging.Handler):
+    def emit(self, record):
+        pass
+
+
+logger = logging.getLogger("fse")
+if len(logger.handlers) == 0:  # To ensure reload() doesn't add another one
+    logger.addHandler(NullHandler())
+
+
+__version__ = "0.2.0"
+
+
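+A minimal usage sketch of the package-level API, mirroring the README example
+(the tiny Word2Vec model is illustrative only):
+
+from gensim.models import Word2Vec
+from fse import Average, IndexedList
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+model = Average(Word2Vec(sentences, min_count=1, vector_size=10))
+model.train(IndexedList(sentences))
+model.sv.similarity(0, 1)  # similarity of the two sentence vectors
+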
+
+

Sub-modules

+
+
fse.inputs
+
+
+
+
fse.models
+
+
+
+
fse.vectors
+
+

Class to obtain a BaseKeyedVector from.

+
+
+
+
+
+
+
+
+

Classes

+
+
+class NullHandler +(level=0) +
+
+

Handler instances dispatch logging events to specific destinations.

+

The base handler class. Acts as a placeholder which defines the Handler +interface. Handlers can optionally use Formatter instances to format +records as desired. By default, no formatter is specified; in this case, +the 'raw' message as determined by record.message is logged.

+

Initializes the instance - basically setting the formatter to None +and the filter list to empty.

+
+ +Expand source code + +
class NullHandler(logging.Handler):
+    def emit(self, record):
+        pass
+
+

Ancestors

+
    +
  • logging.Handler
  • +
  • logging.Filterer
  • +
+

Methods

+
+
+def emit(self, record) +
+
+

Do whatever it takes to actually log the specified logging record.

+

This version is intended to be implemented by subclasses and so +raises a NotImplementedError.

+
+ +Expand source code + +
def emit(self, record):
+    pass
+
+
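+Since the package attaches a NullHandler by default, log output stays silent
+unless the application configures a handler. A sketch using only the standard
+logging API:
+
+import logging
+
+logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
+logging.getLogger("fse").setLevel(logging.INFO)
+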
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/fse/inputs.html b/docs/fse/inputs.html new file mode 100644 index 0000000..682a8f0 --- /dev/null +++ b/docs/fse/inputs.html @@ -0,0 +1,1256 @@ + + + + + + +fse.inputs API documentation + + + + + + + + + + + +
+
+
+

Module fse.inputs

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+from pathlib import Path
+from typing import List, MutableSequence, Union
+
+from gensim.utils import any2unicode
+from numpy import concatenate, ndarray
+from smart_open import open
+
+
+class BaseIndexedList(MutableSequence):
+    def __init__(self, *args: List[Union[list, set, ndarray]]):
+        """Base object to be used for feeding in-memory stored lists of sentences to the
+        training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set/ndarray objects.
+        """
+
+        self.items = list()
+
+        if len(args) == 1:
+            self._check_list_type(args[0])
+            self.items = args[0]
+        else:
+            for arg in args:
+                self.extend(arg)
+
+        super().__init__()
+
+    def _check_list_type(self, obj: object):
+        """Checks input validity."""
+        if isinstance(obj, (list, set, ndarray)):
+            return 1
+        else:
+            raise TypeError(f"Arg must be list/set type. Got {type(obj)}")
+
+    def _check_str_type(self, obj: object):
+        """Checks input validity."""
+        if isinstance(obj, str):
+            return 1
+        else:
+            raise TypeError(f"Arg must be str type. Got {type(obj)}")
+
+    def __len__(self):
+        """List length.
+
+        Returns
+        -------
+        int
+           Length of the IndexedList
+        """
+        return len(self.items)
+
+    def __str__(self):
+        """Human readable representation of the object's state, used for debugging.
+
+        Returns
+        -------
+        str
+           Human readable representation of the object's state (words and tags).
+        """
+        return str(self.items)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple ([str], int)
+            Returns the core object, a tuple, for every sentence embedding model.
+        """
+        raise NotImplementedError()
+
+    def __delitem__(self, i: int):
+        """Delete an item."""
+        del self.items[i]
+
+    def __setitem__(self, i: int, item: str):
+        """Sets an item."""
+        self._check_str_type(item)
+        self.items[i] = item
+
+    def insert(self, i: int, item: str):
+        """Inserts an item at a position."""
+        self._check_str_type(item)
+        self.items.insert(i, item)
+
+    def append(self, item: str):
+        """Appends item at last position."""
+        self._check_str_type(item)
+        self.insert(len(self.items), item)
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        """Extens list."""
+        self._check_list_type(arg)
+
+        if not isinstance(arg, ndarray):
+            self.items += arg
+        else:
+            self.items = concatenate([self.items, arg], axis=0)
+
+
+class IndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray]):
+        """Quasi-list to be used for feeding in-memory stored lists of sentences to the
+        training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        """
+        super(IndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i), i)
+
+
+class CIndexedList(BaseIndexedList):
+    def __init__(
+        self, *args: Union[list, set, ndarray], custom_index: Union[list, ndarray]
+    ):
+        """Quasi-list with custom indices to be used for feeding in-memory stored lists
+        of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_index = custom_index
+
+        super(CIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+
+class SplitIndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray]):
+        """Quasi-list with string splitting to be used for feeding in-memory stored
+        lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        """
+        super(SplitIndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i).split(), i)
+
+
+class SplitCIndexedList(BaseIndexedList):
+    def __init__(
+        self, *args: Union[list, set, ndarray], custom_index: Union[list, ndarray]
+    ):
+        """Quasi-list with custom indices and string splitting to be used for feeding
+        in-memory stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_index = custom_index
+
+        super(SplitCIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i).split(), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+
+class CSplitIndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray], custom_split: callable):
+        """Quasi-list with custom string splitting to be used for feeding in-memory
+        stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_split : callable
+            Split function to be used to convert strings into list of str.
+        """
+        self.custom_split = custom_split
+        super(CSplitIndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.custom_split(self.items.__getitem__(i)), i)
+
+
+class CSplitCIndexedList(BaseIndexedList):
+    def __init__(
+        self,
+        *args: Union[list, set, ndarray],
+        custom_split: callable,
+        custom_index: Union[list, ndarray],
+    ):
+        """Quasi-list with custom indices and ustom string splitting to be used for
+        feeding in-memory stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_split : callable
+            Split function to be used to convert strings into list of str.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_split = custom_split
+        self.custom_index = custom_index
+
+        super(CSplitCIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.custom_split(self.items.__getitem__(i)), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+
+class IndexedLineDocument(object):
+    def __init__(self, path, get_able=True):
+        """Iterate over a file that contains sentences: one line = tuple([str], int).
+
+        Words are expected to be already preprocessed and separated by whitespace. Sentence tags are constructed
+        automatically from the sentence line number.
+
+        Parameters
+        ----------
+        path : str
+            The path of the file to read and return lines from
+        get_able : bool, optional
+            Use to determine if the IndexedLineDocument is indexable.
+            This functionality is required if you want to pass an indexable to
+            :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`.
+
+        """
+        self.path = Path(path)
+        self.line_offset = list()
+        self.get_able = bool(get_able)
+
+        if self.get_able:
+            self._build_offsets()
+
+    def _build_offsets(self):
+        """Builds an offset table to index the file."""
+        with open(self.path, "rb") as f:
+            offset = f.tell()
+            for line in f:
+                self.line_offset.append(offset)
+                offset += len(line)
+
+    def __getitem__(self, i):
+        """Returns the line indexed by i. Primarily used for.
+
+        :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`
+
+        Parameters
+        ----------
+        i : int
+            The line index used to index the file
+
+        Returns
+        -------
+        str
+            line at the current index
+        """
+        if not self.get_able:
+            raise RuntimeError(
+                "To index the lines, you must contruct with get_able=True"
+            )
+
+        with open(self.path, "rb") as f:
+            f.seek(self.line_offset[i])
+            output = f.readline()
+            f.seek(0)
+            return any2unicode(output).rstrip()
+
+    def __iter__(self):
+        """Iterate through the lines in the source.
+
+        Yields
+        ------
+        tuple : (list[str], int)
+            Tuple of list of string and index
+        """
+        with open(self.path, "rb") as f:
+            for i, line in enumerate(f):
+                yield (any2unicode(line).split(), i)
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class BaseIndexedList +(*args: List[Union[list, set, numpy.ndarray]]) +
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Base object to be used for feeding in-memory stored lists of sentences to the +training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set/ndarray objects.
+
+
+ +Expand source code + +
class BaseIndexedList(MutableSequence):
+    def __init__(self, *args: List[Union[list, set, ndarray]]):
+        """Base object to be used for feeding in-memory stored lists of sentences to the
+        training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set/ndarray objects.
+        """
+
+        self.items = list()
+
+        if len(args) == 1:
+            self._check_list_type(args[0])
+            self.items = args[0]
+        else:
+            for arg in args:
+                self.extend(arg)
+
+        super().__init__()
+
+    def _check_list_type(self, obj: object):
+        """Checks input validity."""
+        if isinstance(obj, (list, set, ndarray)):
+            return 1
+        else:
+            raise TypeError(f"Arg must be list/set type. Got {type(obj)}")
+
+    def _check_str_type(self, obj: object):
+        """Checks input validity."""
+        if isinstance(obj, str):
+            return 1
+        else:
+            raise TypeError(f"Arg must be str type. Got {type(obj)}")
+
+    def __len__(self):
+        """List length.
+
+        Returns
+        -------
+        int
+           Length of the IndexedList
+        """
+        return len(self.items)
+
+    def __str__(self):
+        """Human readable representation of the object's state, used for debugging.
+
+        Returns
+        -------
+        str
+           Human readable representation of the object's state (words and tags).
+        """
+        return str(self.items)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple ([str], int)
+            Returns the core object, a tuple, for every sentence embedding model.
+        """
+        raise NotImplementedError()
+
+    def __delitem__(self, i: int):
+        """Delete an item."""
+        del self.items[i]
+
+    def __setitem__(self, i: int, item: str):
+        """Sets an item."""
+        self._check_str_type(item)
+        self.items[i] = item
+
+    def insert(self, i: int, item: str):
+        """Inserts an item at a position."""
+        self._check_str_type(item)
+        self.items.insert(i, item)
+
+    def append(self, item: str):
+        """Appends item at last position."""
+        self._check_str_type(item)
+        self.insert(len(self.items), item)
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        """Extens list."""
+        self._check_list_type(arg)
+
+        if not isinstance(arg, ndarray):
+            self.items += arg
+        else:
+            self.items = concatenate([self.items, arg], axis=0)
+
+

Ancestors

+
    +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Subclasses

+ +

Methods

+
+
+def append(self, item: str) +
+
+

Appends item at last position.

+
+ +Expand source code + +
def append(self, item: str):
+    """Appends item at last position."""
+    self._check_str_type(item)
+    self.insert(len(self.items), item)
+
+
+
+def extend(self, arg: Union[list, set, numpy.ndarray]) +
+
+

Extens list.

+
+ +Expand source code + +
def extend(self, arg: Union[list, set, ndarray]):
+    """Extens list."""
+    self._check_list_type(arg)
+
+    if not isinstance(arg, ndarray):
+        self.items += arg
+    else:
+        self.items = concatenate([self.items, arg], axis=0)
+
+
+
+def insert(self, i: int, item: str) +
+
+

Inserts an item at a position.

+
+ +Expand source code + +
def insert(self, i: int, item: str):
+    """Inserts an item at a position."""
+    self._check_str_type(item)
+    self.items.insert(i, item)
+
+
+
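+A sketch of the (words, index) tuple contract that concrete __getitem__
+implementations must satisfy, shown via the IndexedList subclass documented
+below:
+
+from fse import IndexedList
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+s = IndexedList(sentences)
+s[0]  # (["cat", "say", "meow"], 0)
+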
+
+
+class CIndexedList +(*args: Union[list, set, numpy.ndarray], custom_index: Union[list, numpy.ndarray]) +
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list with custom indices to be used for feeding in-memory stored lists +of sentences to the training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
custom_index : list, ndarray
+
Custom index to support many to one mappings.
+
+
+ +Expand source code + +
class CIndexedList(BaseIndexedList):
+    def __init__(
+        self, *args: Union[list, set, ndarray], custom_index: Union[list, ndarray]
+    ):
+        """Quasi-list with custom indices to be used for feeding in-memory stored lists
+        of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_index = custom_index
+
+        super(CIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
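+A sketch of the many-to-one mapping that custom_index enables: both sentences
+below map to target row 0, so they contribute to the same sentence vector
+during training:
+
+from fse import CIndexedList
+
+sentences = [["hello", "world"], ["hello", "earth"]]
+s = CIndexedList(sentences, custom_index=[0, 0])
+s[1]  # (["hello", "earth"], 0)
+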
+class CSplitCIndexedList
+(*args: Union[list, set, numpy.ndarray], custom_split: callable, custom_index: Union[list, numpy.ndarray])
+
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list with custom indices and custom string splitting to be used for
+feeding in-memory stored lists of sentences to the training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
custom_split : callable
+
Split function to be used to convert strings into list of str.
+
custom_index : list, ndarray
+
Custom index to support many to one mappings.
+
+
+ +Expand source code + +
class CSplitCIndexedList(BaseIndexedList):
+    def __init__(
+        self,
+        *args: Union[list, set, ndarray],
+        custom_split: callable,
+        custom_index: Union[list, ndarray],
+    ):
+        """Quasi-list with custom indices and ustom string splitting to be used for
+        feeding in-memory stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_split : callable
+            Split function to be used to convert strings into list of str.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_split = custom_split
+        self.custom_index = custom_index
+
+        super(CSplitCIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.custom_split(self.items.__getitem__(i)), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
+class CSplitIndexedList
+(*args: Union[list, set, numpy.ndarray], custom_split: callable)
+
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list with custom string splitting to be used for feeding in-memory +stored lists of sentences to the training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
custom_split : callable
+
Split function to be used to convert strings into list of str.
+
+
+ +Expand source code + +
class CSplitIndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray], custom_split: callable):
+        """Quasi-list with custom string splitting to be used for feeding in-memory
+        stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_split : callable
+            Split function to be used to convert strings into list of str.
+        """
+        self.custom_split = custom_split
+        super(CSplitIndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.custom_split(self.items.__getitem__(i)), i)
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
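+A sketch of supplying a custom_split callable, here a lowercasing whitespace
+tokenizer:
+
+from fse import CSplitIndexedList
+
+s = CSplitIndexedList(["Hello there", "General Kenobi"], custom_split=lambda x: x.lower().split())
+s[0]  # (["hello", "there"], 0)
+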
+class IndexedLineDocument +(path, get_able=True) +
+
+

Iterate over a file that contains sentences: one line = tuple([str], int).

+

Words are expected to be already preprocessed and separated by whitespace. Sentence tags are constructed +automatically from the sentence line number.

+

Parameters

+
+
path : str
+
The path of the file to read and return lines from
+
get_able : bool, optional
+
Use to determine if the IndexedLineDocument is indexable. +This functionality is required if you want to pass an indexable to +:meth:~fse.models.sentencevectors.SentenceVectors.most_similar.
+
+
+ +Expand source code + +
class IndexedLineDocument(object):
+    def __init__(self, path, get_able=True):
+        """Iterate over a file that contains sentences: one line = tuple([str], int).
+
+        Words are expected to be already preprocessed and separated by whitespace. Sentence tags are constructed
+        automatically from the sentence line number.
+
+        Parameters
+        ----------
+        path : str
+            The path of the file to read and return lines from
+        get_able : bool, optional
+            Use to determine if the IndexedLineDocument is indexable.
+            This functionality is required if you want to pass an indexable to
+            :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`.
+
+        """
+        self.path = Path(path)
+        self.line_offset = list()
+        self.get_able = bool(get_able)
+
+        if self.get_able:
+            self._build_offsets()
+
+    def _build_offsets(self):
+        """Builds an offset table to index the file."""
+        with open(self.path, "rb") as f:
+            offset = f.tell()
+            for line in f:
+                self.line_offset.append(offset)
+                offset += len(line)
+
+    def __getitem__(self, i):
+        """Returns the line indexed by i. Primarily used for.
+
+        :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar`
+
+        Parameters
+        ----------
+        i : int
+            The line index used to index the file
+
+        Returns
+        -------
+        str
+            line at the current index
+        """
+        if not self.get_able:
+            raise RuntimeError(
+                "To index the lines, you must contruct with get_able=True"
+            )
+
+        with open(self.path, "rb") as f:
+            f.seek(self.line_offset[i])
+            output = f.readline()
+            f.seek(0)
+            return any2unicode(output).rstrip()
+
+    def __iter__(self):
+        """Iterate through the lines in the source.
+
+        Yields
+        ------
+        tuple : (list[str], int)
+            Tuple of list of string and index
+        """
+        with open(self.path, "rb") as f:
+            for i, line in enumerate(f):
+                yield (any2unicode(line).split(), i)
+
+
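+A sketch of streaming sentences from disk (the file name is illustrative; one
+whitespace-separated sentence per line is expected):
+
+from fse import IndexedLineDocument
+
+doc = IndexedLineDocument("sentences.txt")
+for words, index in doc:
+    pass  # yields (list_of_tokens, line_number)
+doc[3]  # raw line 3; works because get_able=True builds the offset table
+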
+
+class IndexedList +(*args: Union[list, set, numpy.ndarray]) +
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list to be used for feeding in-memory stored lists of sentences to the +training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
+
+ +Expand source code + +
class IndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray]):
+        """Quasi-list to be used for feeding in-memory stored lists of sentences to the
+        training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        """
+        super(IndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i), i)
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
+class SplitCIndexedList +(*args: Union[list, set, numpy.ndarray], custom_index: Union[list, numpy.ndarray]) +
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list with custom indices and string splitting to be used for feeding +in-memory stored lists of sentences to the training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
custom_index : list, ndarray
+
Custom index to support many to one mappings.
+
+
+ +Expand source code + +
class SplitCIndexedList(BaseIndexedList):
+    def __init__(
+        self, *args: Union[list, set, ndarray], custom_index: Union[list, ndarray]
+    ):
+        """Quasi-list with custom indices and string splitting to be used for feeding
+        in-memory stored lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        custom_index : list, ndarray
+            Custom index to support many to one mappings.
+        """
+        self.custom_index = custom_index
+
+        super(SplitCIndexedList, self).__init__(*args)
+
+        if len(self.items) != len(self.custom_index):
+            raise RuntimeError(
+                f"Size of custom_index {len(custom_index)} does not match items {len(self.items)}"
+            )
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i).split(), self.custom_index[i])
+
+    def __delitem__(self, i: int):
+        raise NotImplementedError("Method currently not supported")
+
+    def __setitem__(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def insert(self, i: int, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def append(self, item: str):
+        raise NotImplementedError("Method currently not supported")
+
+    def extend(self, arg: Union[list, set, ndarray]):
+        raise NotImplementedError("Method currently not supported")
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
+class SplitIndexedList +(*args: Union[list, set, numpy.ndarray]) +
+
+

All the operations on a read-only sequence.

+

Concrete subclasses must override new or init, +getitem, and len.

+

Quasi-list with string splitting to be used for feeding in-memory stored +lists of sentences to the training routine.

+

Parameters

+
+
args : lists, sets, ndarray
+
Arguments to be merged into a single container. Can be single or multiple list/set objects.
+
+
+ +Expand source code + +
class SplitIndexedList(BaseIndexedList):
+    def __init__(self, *args: Union[list, set, ndarray]):
+        """Quasi-list with string splitting to be used for feeding in-memory stored
+        lists of sentences to the training routine.
+
+        Parameters
+        ----------
+        args : lists, sets, ndarray
+            Arguments to be merged into a single container. Can be single or multiple list/set objects.
+        """
+        super(SplitIndexedList, self).__init__(*args)
+
+    def __getitem__(self, i: int) -> tuple:
+        """Getitem method.
+
+        Returns
+        -------
+        tuple
+            Returns the core object, tuple, for every sentence embedding model.
+        """
+        return (self.items.__getitem__(i).split(), i)
+
+

Ancestors

+
    +
  • BaseIndexedList
  • +
  • collections.abc.MutableSequence
  • +
  • collections.abc.Sequence
  • +
  • collections.abc.Reversible
  • +
  • collections.abc.Collection
  • +
  • collections.abc.Sized
  • +
  • collections.abc.Iterable
  • +
  • collections.abc.Container
  • +
  • typing.Generic
  • +
+

Inherited members

+ +
+
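+A sketch of the whitespace splitting performed on access:
+
+from fse import SplitIndexedList
+
+s = SplitIndexedList(["cat say meow", "dog say woof"])
+s[0]  # (["cat", "say", "meow"], 0)
+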
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/fse/models/average.html b/docs/fse/models/average.html new file mode 100644 index 0000000..58fe77d --- /dev/null +++ b/docs/fse/models/average.html @@ -0,0 +1,715 @@ + + + + + + +fse.models.average API documentation + + + + + + + + + + + +
+
+
+

Module fse.models.average

+
+
+

This module implements the base class to compute average representations for sentences, using highly optimized C routines, +data streaming and Pythonic interfaces.

+

The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification. +For more information, see https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf.

+

The training algorithm is based on the Gensim implementation of Word2Vec, FastText, and Doc2Vec.
+For more information, see: :class:~gensim.models.word2vec.Word2Vec, :class:~gensim.models.fasttext.FastText, or
+:class:~gensim.models.doc2vec.Doc2Vec.

+

Initialize and train a :class:~fse.models.sentence2vec.Sentence2Vec model

+
+

Sourcecode: pycon

+
+
+
+

from gensim.models.word2vec import Word2Vec +sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] +model = Word2Vec(sentences, min_count=1, vector_size=20)

+

from fse.models.average import Average +
+avg = Average(model) +avg.train([(s, i) for i, s in enumerate(sentences)]) +avg.sv.vectors.shape +(2, 20)

+
+
+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+"""This module implements the base class to compute average representations for sentences, using highly optimized C routines,
+data streaming and Pythonic interfaces.
+
+The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification.
+For more information, see <https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf>.
+
+The training algorithm is based on the Gensim implementation of Word2Vec, FastText, and Doc2Vec. 
+For more information, see: :class:`~gensim.models.word2vec.Word2Vec`, :class:`~gensim.models.fasttext.FastText`, or
+:class:`~gensim.models.doc2vec.Doc2Vec`.
+
+Initialize and train a :class:`~fse.models.sentence2vec.Sentence2Vec` model
+
+.. sourcecode:: pycon
+
+        >>> from gensim.models.word2vec import Word2Vec
+        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+        >>> model = Word2Vec(sentences, min_count=1, vector_size=20)
+
+        >>> from fse.models.average import Average        
+        >>> avg = Average(model)
+        >>> avg.train([(s, i) for i, s in enumerate(sentences)])
+        >>> avg.sv.vectors.shape
+        (2, 20)
+
+"""
+
+from __future__ import division
+
+from fse.models.base_s2v import BaseSentence2VecModel
+
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.fasttext import ft_ngram_hashes
+
+from numpy import (
+    ndarray,
+    float32 as REAL,
+    sum as np_sum,
+    multiply as np_mult,
+    zeros,
+    max as np_max,
+)
+
+from typing import List, Tuple
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def train_average_np(
+    model: BaseSentence2VecModel,
+    indexed_sentences: List[tuple],
+    target: ndarray,
+    memory: ndarray,
+) -> Tuple[int, int]:
+    """Training on a sequence of sentences and update the target ndarray.
+
+    Called internally from :meth:`~fse.models.average.Average._do_train_job`.
+
+    Warnings
+    --------
+    This is the non-optimized, pure Python version. If you have a C compiler,
+    fse will use an optimized code path from :mod:`fse.models.average_inner` instead.
+
+    Parameters
+    ----------
+    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+        The BaseSentence2VecModel model instance.
+    indexed_sentences : iterable of tuple
+        The sentences used to train the model.
+    target : ndarray
+        The target ndarray. We use the index from indexed_sentences
+        to write into the corresponding row of target.
+    memory : ndarray
+        Private memory for each working thread
+
+    Returns
+    -------
+    int, int
+        Number of effective sentences (non-zero) and effective words in the vocabulary used
+        during training the sentence embedding.
+
+    """
+    size = model.wv.vector_size
+
+    w_vectors = model.wv.vectors
+    w_weights = model.word_weights
+
+    s_vectors = target
+
+    is_ft = model.is_ft
+
+    mem = memory[0]
+
+    if is_ft:
+        # NOTE: For Fasttext: Use wv.vectors_vocab
+        # Using the wv.vectors from fasttext had horrible effects on the sts results
+        # I suspect this is because the wv.vectors are based on the averages of
+        # wv.vectors_vocab + wv.vectors_ngrams, which will all point into very
+        # similar directions.
+        max_ngrams = model.batch_ngrams
+        w_vectors = model.wv.vectors_vocab
+        ngram_vectors = model.wv.vectors_ngrams
+        min_n = model.wv.min_n
+        max_n = model.wv.max_n
+        bucket = model.wv.bucket
+        oov_weight = np_max(w_weights)
+
+    eff_sentences, eff_words = 0, 0
+
+    if not is_ft:
+        for obj in indexed_sentences:
+            mem.fill(0.0)
+            sent = obj[0]
+            sent_adr = obj[1]
+
+            word_indices = [
+                model.wv.key_to_index[word]
+                for word in sent
+                if word in model.wv.key_to_index
+            ]
+            eff_sentences += 1
+            if not len(word_indices):
+                continue
+            eff_words += len(word_indices)
+
+            mem += np_sum(
+                np_mult(w_vectors[word_indices], w_weights[word_indices][:, None]),
+                axis=0,
+            )
+            mem *= 1 / len(word_indices)
+            s_vectors[sent_adr] = mem.astype(REAL)
+    else:
+        for obj in indexed_sentences:
+            mem.fill(0.0)
+            sent = obj[0]
+            sent_adr = obj[1]
+
+            if not len(sent):
+                continue
+            mem = zeros(size, dtype=REAL)
+
+            eff_sentences += 1
+            eff_words += len(sent)  # Counts everything in the sentence
+
+            for word in sent:
+                if word in model.wv.key_to_index:
+                    word_index = model.wv.key_to_index[word]
+                    mem += w_vectors[word_index] * w_weights[word_index]
+                else:
+                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket)[
+                        :max_ngrams
+                    ]
+                    if len(ngram_hashes) == 0:
+                        continue
+                    mem += oov_weight * (
+                        np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes)
+                    )
+                # Implicit addition of zero if oov does not contain any ngrams
+            s_vectors[sent_adr] = mem / len(sent)
+
+    return eff_sentences, eff_words
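+
+# Editorial sketch, not part of the original module: the core reduction of the
+# pure-Python (non-FastText) path above is a weighted mean over the rows of
+# the in-vocabulary word vectors.
+def _weighted_sentence_mean(w_vectors, w_weights, word_indices):
+    weighted = np_mult(w_vectors[word_indices], w_weights[word_indices][:, None])
+    return np_sum(weighted, axis=0) / len(word_indices)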
+
+
+try:
+    from fse.models.average_inner import train_average_cy
+    from fse.models.average_inner import (
+        FAST_VERSION,
+        MAX_WORDS_IN_BATCH,
+        MAX_NGRAMS_IN_BATCH,
+    )
+
+    train_average = train_average_cy
+except ImportError:
+    FAST_VERSION = -1
+    MAX_WORDS_IN_BATCH = 10000
+    MAX_NGRAMS_IN_BATCH = 40
+    train_average = train_average_np
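+
+# Editorial note, not part of the original module: FAST_VERSION == -1 signals
+# that the Cython routine could not be imported and the pure-Python fallback
+# train_average_np is in use:
+#
+#     from fse.models.average import FAST_VERSION
+#     FAST_VERSION  # -1 -> slow path; otherwise the optimized path is active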
+
+
+class Average(BaseSentence2VecModel):
+    """Train, use and evaluate averaged sentence vectors.
+
+    The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and
+    :meth:`~fse.models.average.Average.load` methods.
+
+    Some important attributes are the following:
+
+    Attributes
+    ----------
+    wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+        This object essentially contains the mapping between words and embeddings. After training, it can be used
+        directly to query those embeddings in various ways. See the module level docstring for examples.
+
+    sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+        This object contains the sentence vectors inferred from the training data. There will be one such vector
+        for each unique sentence supplied during training. They may be individually accessed using the index.
+
+    prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+        The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used
+        to move the vectors to disk for training with memmap.
+
+    """
+
+    def __init__(
+        self,
+        model: KeyedVectors,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+        **kwargs
+    ):
+        """Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all
+        words in a sentence without further transformation.
+
+        The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification.
+        For more information, see <https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf>.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.vocab and wv.vector elements are required.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, e.g. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv.vocab.count based on :class:`~wordfreq`.
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+
+        """
+
+        super(Average, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+            batch_words=MAX_WORDS_IN_BATCH,
+            batch_ngrams=MAX_NGRAMS_IN_BATCH,
+            fast_version=FAST_VERSION,
+        )
+
+    def _do_train_job(
+        self, data_iterable: List[tuple], target: ndarray, memory: ndarray
+    ) -> Tuple[int, int]:
+        """ Internal routine which is called on training and performs averaging for all entries in the iterable """
+        eff_sentences, eff_words = train_average(
+            model=self, indexed_sentences=data_iterable, target=target, memory=memory
+        )
+        return eff_sentences, eff_words
+
+    def _check_parameter_sanity(self, **kwargs):
+        """ Check the sanity of all child paramters """
+        if not all(self.word_weights == 1.0):
+            raise ValueError("All word weights must equal one for averaging")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training """
+        pass
+
+    def _post_train_calls(self, **kwargs):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        pass
+
+    def _post_inference_calls(self, **kwargs):
+        """Function calls to perform after training & inference
+        Examples include the removal of components
+        """
+        pass
+
+    def _check_dtype_santiy(self, **kwargs):
+        """ Check the dtypes of all child attributes"""
+        pass
+
+
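+A sketch of the lang_freq option described in the docstring above, for vectors
+shipped without frequency information (the vector file path is illustrative):
+
+from gensim.models.keyedvectors import KeyedVectors
+from fse.models.average import Average
+
+wv = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
+model = Average(wv, lang_freq="en")  # induces word frequencies via wordfreq
+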
+
+
+
+
+
+

Functions

+
+
+def train_average_np(model: BaseSentence2VecModel, indexed_sentences: List[tuple], target: numpy.ndarray, memory: numpy.ndarray) -> Tuple[int, int] +
+
+

Training on a sequence of sentences and update the target ndarray.

+

Called internally from :meth:~fse.models.average.Average._do_train_job.

+

Warnings

+

This is the non-optimized, pure Python version. If you have a C compiler, fse will use an optimized code path from :mod:`fse.models.average_inner` instead.

+

Parameters

+
+
model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+
The BaseSentence2VecModel model instance.
+
indexed_sentences : iterable of tuple
+
The sentences used to train the model.
+
target : ndarray
+
The target ndarray. We use the index from indexed_sentences to write into the corresponding row of target.
+
memory : ndarray
+
Private memory for each working thread
+
+

Returns

+
+
int, int
+
Number of effective sentences (non-zero) and effective words in the vocabulary used during training the sentence embedding.
+
+
def train_average_np(
+    model: BaseSentence2VecModel,
+    indexed_sentences: List[tuple],
+    target: ndarray,
+    memory: ndarray,
+) -> Tuple[int, int]:
+    """Training on a sequence of sentences and update the target ndarray.
+
+    Called internally from :meth:`~fse.models.average.Average._do_train_job`.
+
+    Warnings
+    --------
+    This is the non-optimized, pure Python version. If you have a C compiler,
+    fse will use an optimized code path from :mod:`fse.models.average_inner` instead.
+
+    Parameters
+    ----------
+    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+        The BaseSentence2VecModel model instance.
+    indexed_sentences : iterable of tuple
+        The sentences used to train the model.
+    target : ndarray
+        The target ndarray. We use the index from indexed_sentences
+        to write into the corresponding row of target.
+    memory : ndarray
+        Private memory for each working thread
+
+    Returns
+    -------
+    int, int
+        Number of effective sentences (non-zero) and effective words in the vocabulary used
+        during training the sentence embedding.
+
+    """
+    size = model.wv.vector_size
+
+    w_vectors = model.wv.vectors
+    w_weights = model.word_weights
+
+    s_vectors = target
+
+    is_ft = model.is_ft
+
+    mem = memory[0]
+
+    if is_ft:
+        # NOTE: For Fasttext: Use wv.vectors_vocab
+        # Using the wv.vectors from fasttext had horrible effects on the sts results
+        # I suspect this is because the wv.vectors are based on the averages of
+        # wv.vectors_vocab + wv.vectors_ngrams, which will all point into very
+        # similar directions.
+        max_ngrams = model.batch_ngrams
+        w_vectors = model.wv.vectors_vocab
+        ngram_vectors = model.wv.vectors_ngrams
+        min_n = model.wv.min_n
+        max_n = model.wv.max_n
+        bucket = model.wv.bucket
+        oov_weight = np_max(w_weights)
+
+    eff_sentences, eff_words = 0, 0
+
+    if not is_ft:
+        for obj in indexed_sentences:
+            mem.fill(0.0)
+            sent = obj[0]
+            sent_adr = obj[1]
+
+            word_indices = [
+                model.wv.key_to_index[word]
+                for word in sent
+                if word in model.wv.key_to_index
+            ]
+            eff_sentences += 1
+            if not len(word_indices):
+                continue
+            eff_words += len(word_indices)
+
+            mem += np_sum(
+                np_mult(w_vectors[word_indices], w_weights[word_indices][:, None]),
+                axis=0,
+            )
+            mem *= 1 / len(word_indices)
+            s_vectors[sent_adr] = mem.astype(REAL)
+    else:
+        for obj in indexed_sentences:
+            mem.fill(0.0)
+            sent = obj[0]
+            sent_adr = obj[1]
+
+            if not len(sent):
+                continue
+            mem = zeros(size, dtype=REAL)
+
+            eff_sentences += 1
+            eff_words += len(sent)  # Counts everything in the sentence
+
+            for word in sent:
+                if word in model.wv.key_to_index:
+                    word_index = model.wv.key_to_index[word]
+                    mem += w_vectors[word_index] * w_weights[word_index]
+                else:
+                    ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket)[
+                        :max_ngrams
+                    ]
+                    if len(ngram_hashes) == 0:
+                        continue
+                    mem += oov_weight * (
+                        np_sum(ngram_vectors[ngram_hashes], axis=0) / len(ngram_hashes)
+                    )
+                # Implicit addition of zero if oov does not contain any ngrams
+            s_vectors[sent_adr] = mem / len(sent)
+
+    return eff_sentences, eff_words
+
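+A direct-call sketch continuing the toy example above (normally this routine is only invoked through `_do_train_job`; the memory tuple layout is taken from `_get_thread_working_mem`):
+
+>>> from fse.models.average import train_average_np
+>>> import numpy as np
+>>> target = np.zeros((len(data), avg.sv.vector_size), dtype=np.float32)
+>>> memory = avg._get_thread_working_mem()
+>>> train_average_np(avg, IndexedList(data), target, memory)
+(2, 4)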
+
+
+
+
+

Classes

+
+
+class Average(model: gensim.models.keyedvectors.KeyedVectors, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None, **kwargs)
+
+

Train, use and evaluate averaged sentence vectors.

+

The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and :meth:`~fse.models.average.Average.load` methods.

+

Some important attributes are the following:

+

Attributes

+
+
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+
This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples.
+
sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+
This object contains the sentence vectors inferred from the training data. There will be one such vector for each unique sentence supplied during training. They may be individually accessed using the index.
+
prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+
The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used to move the vectors to disk for training with memmap.
+
+

Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all words in a sentence without further transformation.

+

The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification. For more information, see https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf.

+

Parameters

+
+
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings the wv.key_to_index and wv.vectors elements are required.
+
sv_mapfile_path : str, optional
+
Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+
wv_mapfile_path : str, optional
+
Optional path to store the word-vectors in for very large datasets. Used for memmap. Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+
workers : int, optional
+
Number of working threads, used for multithreading. For most tasks (few words in a sentence) a value of 1 should be more than enough.
+
lang_freq : str, optional
+
Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about the frequency of a word. As the frequency is required for estimating the word weights, we induce frequencies into the wv object based on :class:`~wordfreq`. If no frequency information is available, you can choose the language to estimate the frequency. See https://github.com/LuminosoInsight/wordfreq
+
+
class Average(BaseSentence2VecModel):
+    """Train, use and evaluate averaged sentence vectors.
+
+    The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and
+    :meth:`~fse.models.average.Average.load` methods.
+
+    Some important attributes are the following:
+
+    Attributes
+    ----------
+    wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+        This object essentially contains the mapping between words and embeddings. After training, it can be used
+        directly to query those embeddings in various ways. See the module level docstring for examples.
+
+    sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+        This object contains the sentence vectors inferred from the training data. There will be one such vector
+        for each unique sentence supplied during training. They may be individually accessed using the index.
+
+    prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+        The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used
+        to move the vectors to disk for training with memmap.
+
+    """
+
+    def __init__(
+        self,
+        model: KeyedVectors,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+        **kwargs
+    ):
+        """Average (unweighted) sentence embeddings model. Performs a simple averaging operation over all
+        words in a sentence without further transformation.
+
+        The implementation is based on Iyyer et al. (2015): Deep Unordered Composition Rivals Syntactic Methods for Text Classification.
+        For more information, see <https://people.cs.umass.edu/~miyyer/pubs/2015_acl_dan.pdf>.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.key_to_index and wv.vectors elements are required.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv object based on :class:`~wordfreq`.
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+
+        """
+
+        super(Average, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+            batch_words=MAX_WORDS_IN_BATCH,
+            batch_ngrams=MAX_NGRAMS_IN_BATCH,
+            fast_version=FAST_VERSION,
+        )
+
+    def _do_train_job(
+        self, data_iterable: List[tuple], target: ndarray, memory: ndarray
+    ) -> Tuple[int, int]:
+        """ Internal routine which is called on training and performs averaging for all entries in the iterable """
+        eff_sentences, eff_words = train_average(
+            model=self, indexed_sentences=data_iterable, target=target, memory=memory
+        )
+        return eff_sentences, eff_words
+
+    def _check_parameter_sanity(self, **kwargs):
+        """ Check the sanity of all child paramters """
+        if not all(self.word_weights == 1.0):
+            raise ValueError("All word weights must equal one for averaging")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training """
+        pass
+
+    def _post_train_calls(self, **kwargs):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        pass
+
+    def _post_inference_calls(self, **kwargs):
+        """Function calls to perform after training & inference
+        Examples include the removal of components
+        """
+        pass
+
+    def _check_dtype_santiy(self, **kwargs):
+        """ Check the dtypes of all child attributes"""
+        pass
+
+
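+Continuing the sketch above, infer() computes vectors for new sentences without touching the trained sv.vectors:
+
+>>> vecs = avg.infer([(["hello", "day"], 0)])
+>>> vecs.shape
+(1, 8)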

Ancestors

+fse.models.base_s2v.BaseSentence2VecModel
+gensim.utils.SaveLoad

Subclasses

+fse.models.sif.SIF
+fse.models.usif.uSIF

Inherited members

+fse.models.base_s2v.BaseSentence2VecModel:
+    estimate_memory, infer, load, save, scan_sentences, train
+
+
+
+ +
\ No newline at end of file
diff --git a/docs/fse/models/average_inner.html b/docs/fse/models/average_inner.html
new file mode 100644
index 0000000..c9dba90
--- /dev/null
+++ b/docs/fse/models/average_inner.html
@@ -0,0 +1,95 @@
fse.models.average_inner API documentation
+
+
+

Module fse.models.average_inner

+
+
+

Optimized cython functions for computing sentence embeddings

+
+
+
+
+
+
+

Functions

+
+
+def init()
+
+
+
+
+def train_average_cy(model, indexed_sentences, target, memory)
+
+

Training on a sequence of sentences and update the target ndarray.

+
Called internally from :meth:`~fse.models.average.Average._do_train_job`.
+
+Parameters
+----------
+model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+    The BaseSentence2VecModel model instance.
+indexed_sentences : iterable of tuple
+    The sentences used to train the model.
+target : ndarray
+    The target ndarray. We use the index from indexed_sentences
+    to write into the corresponding row of target.
+memory : ndarray
+    Private memory for each working thread.
+
+Returns
+-------
+int, int
+    Number of effective sentences (non-zero) and effective words in the vocabulary used 
+    during training the sentence embedding.
+
+
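+Whether this optimized path is active can be checked through the FAST_VERSION flag imported by fse.models.average (a sketch; by the fallback convention used there, -1 indicates the pure-Python path):
+
+>>> from fse.models.average import FAST_VERSION
+>>> FAST_VERSION  # 1 if average_inner compiled, -1 otherwise
+1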
+
+
+
+
+
+ +
\ No newline at end of file
diff --git a/docs/fse/models/base_s2v.html b/docs/fse/models/base_s2v.html
new file mode 100644
index 0000000..b08dbbf
--- /dev/null
+++ b/docs/fse/models/base_s2v.html
@@ -0,0 +1,2636 @@
fse.models.base_s2v API documentation
+
+
+

Module fse.models.base_s2v

+
+
+

Base class containing common methods for training, using & evaluating sentence embeddings. A lot of the code is based on Gensim. I have to thank Radim Rehurek and the whole team for the outstanding library which I used for a lot of my research.

+

Attributes

+
+
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+
This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples.
+
sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+
This object contains the sentence vectors inferred from the training data. There will be one such vector for each unique sentence supplied during training. They may be individually accessed using the index.
+
prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+
The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used to move the vectors to disk for training with memmap.
+
+

See Also

+

:class:`~fse.models.average.Average` : Average sentence model.
:class:`~fse.models.sif.SIF` : Smooth inverse frequency weighted model.
:class:`~fse.models.usif.uSIF` : Unsupervised smooth inverse frequency weighted model.

+
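+A quick sketch of this attribute split, reusing the toy model from the Average examples: word vectors are queried on wv, sentence vectors on sv.
+
+>>> avg.wv["hello"].shape
+(8,)
+>>> avg.sv[0].shape
+(8,)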
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+# Licensed under GNU General Public License v3.0
+
+"""Base class containing common methods for training, using & evaluating sentence embeddings.
+A lot of the code is based on Gensim. I have to thank Radim Rehurek and the whole team
+for the outstanding library which I used for a lot of my research.
+
+Attributes
+----------
+wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+    This object essentially contains the mapping between words and embeddings. After training, it can be used
+    directly to query those embeddings in various ways. See the module level docstring for examples.
+
+sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+    This object contains the sentence vectors inferred from the training data. There will be one such vector
+    for each unique sentence supplied during training. They may be individually accessed using the index.
+
+prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+    The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used
+    to move the vectors to disk for training with memmap.
+
+See Also
+--------
+:class:`~fse.models.average.Average`.
+    Average sentence model.
+:class:`~fse.models.sif.SIF`.
+    Smooth inverse frequency weighted model.
+:class:`~fse.models.usif.uSIF`.
+    Unsupervised Smooth inverse frequency weighted model.
+
+"""
+
+from fse.models.sentencevectors import SentenceVectors, _l2_norm
+
+from fse.models.utils import set_madvise_for_mmap
+
+from gensim.models import Word2Vec, FastText
+from gensim.models.keyedvectors import KeyedVectors
+from gensim.models.fasttext import FastTextKeyedVectors
+from gensim.utils import SaveLoad
+from gensim.matutils import zeros_aligned
+
+from numpy import (
+    ndarray,
+    memmap as np_memmap,
+    float32 as REAL,
+    uint32 as uINT,
+    empty,
+    zeros,
+    vstack,
+    dtype,
+    ones,
+    finfo,
+    full,
+)
+
+from wordfreq import available_languages, get_frequency_dict
+
+from typing import List, Dict, Tuple
+
+from time import time
+from psutil import virtual_memory
+
+from pathlib import Path
+
+import logging
+import warnings
+
+import threading
+
+from queue import Queue
+
+logger = logging.getLogger(__name__)
+
+EPS = finfo(REAL).eps
+
+
+class BaseSentence2VecModel(SaveLoad):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+        fast_version: int = 0,
+        batch_words: int = 10000,
+        batch_ngrams: int = 40,
+        **kwargs,
+    ):
+        """Base class for all Sentence2Vec Models. Provides core functionality, such as
+        save, load, sanity checking, frequency induction, data checking, scanning, etc.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv and wv.vectors elements are required.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv based on :class:`~wordfreq`
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+        fast_version : {-1, 1}, optional
+            Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
+        batch_words : int, optional
+            Number of words to be processed by a single job.
+        batch_ngrams : int, optional
+            Maximum number of ngrams for OOV words.
+        **kwargs : object
+            Key word arguments needed to allow children classes to accept more arguments.
+
+        """
+        set_madvise_for_mmap()
+
+        self.workers = int(workers)
+        self.batch_words = batch_words
+        self.batch_ngrams = batch_ngrams
+        self.wv = None
+
+        self.is_ft = False
+
+        self.wv_mapfile_path = (
+            Path(wv_mapfile_path) if wv_mapfile_path is not None else None
+        )
+        self.wv_mapfile_shapes = {}
+
+        if fast_version < 0:
+            warnings.warn(
+                "C extension not loaded, training/inferring will be slow. "
+                "Install a C compiler and reinstall fse."
+            )
+
+        self._check_and_include_model(model)
+
+        if self.wv_mapfile_path is not None:
+            self._map_all_vectors_to_disk(self.wv_mapfile_path)
+
+        if lang_freq is not None:
+            self._check_language_settings(lang_freq)
+            self._induce_frequencies()
+
+        self.sv = SentenceVectors(
+            vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path
+        )
+        self.prep = BaseSentence2VecPreparer()
+
+        self.word_weights = ones(len(self.wv), REAL)
+
+    def __str__(self) -> str:
+        """Human readable representation of the model's state.
+
+        Returns
+        -------
+        str
+            Human readable representation of the model's state.
+
+        """
+        return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, vector_size={len(self.sv)}"
+
+    def _check_and_include_model(self, model: KeyedVectors):
+        """Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            The model to inject into this class.
+
+        """
+        if isinstance(model, (Word2Vec, FastText)):
+            if not hasattr(model, "wv"):
+                raise RuntimeError("Model does not contain wv object.")
+            self.wv = model.wv
+        elif isinstance(model, KeyedVectors):
+            self.wv = model
+        else:
+            raise RuntimeError(
+                f"Model must be child of BaseWordEmbeddingsModel or KeyedVectors. Received {str(model)}"
+            )
+        self.wv.vectors_norm = None
+
+        if isinstance(self.wv, FastTextKeyedVectors):
+            self.wv.vectors_vocab_norm = None  # Save some space
+            self.wv.vectors_ngrams_norm = None
+            self.is_ft = True
+
+            if not self.wv.compatible_hash:
+                raise RuntimeError("FastText model requires compatible hash function")
+            if not hasattr(self.wv, "vectors_vocab") or self.wv.vectors_vocab is None:
+                raise RuntimeError(
+                    "vectors_vocab required for sentence embeddings not found."
+                )
+            if not hasattr(self.wv, "vectors_ngrams") or self.wv.vectors_ngrams is None:
+                raise RuntimeError(
+                    "Ngram vectors required for sentence embeddings not found."
+                )
+
+        if not hasattr(self.wv, "vectors") or self.wv.vectors is None:
+            raise RuntimeError(
+                "Word vectors required for sentence embeddings not found."
+            )
+
+    def _check_language_settings(self, lang_freq: str):
+        """Check if the supplied language is a compatible with the wordfreq package
+
+        Parameters
+        ----------
+        lang_freq : str
+            The language used to induce the frequencies into the wv object.
+
+        """
+        if lang_freq in available_languages(wordlist="best"):
+            self.lang_freq = str(lang_freq)
+            logger.info(
+                "no frequency mode: using wordfreq for estimation "
+                f"of frequency for language: {self.lang_freq}"
+            )
+        else:
+            raise ValueError(f"Language {lang_freq} is not available in wordfreq")
+
+    def _induce_frequencies(self, domain: int = 2 ** 31 - 1):
+        """Induce frequencies for a pretrained model, as not all pretrained models come with frequencies.
+
+        Parameters
+        ----------
+        domain : int
+            The cumulative count of the vocabulary.
+
+        """
+        freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
+        for word in self.wv.index_to_key:
+            if word in freq_dict:
+                self.wv.set_vecattr(word, "count", int(freq_dict[word] * domain))
+            else:
+                self.wv.set_vecattr(word, "count", int(1e-8 * domain))
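+        # Illustration (assumed numbers): with the default domain = 2**31 - 1,
+        # a word with relative frequency 0.01 receives count
+        # int(0.01 * domain) = 21_474_836, while words missing from the
+        # wordfreq dictionary fall back to int(1e-8 * domain) = 21.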
+
+    def _check_input_data_sanity(self, data_iterable: tuple):
+        """Check if the input data complies with the required formats
+
+        Parameters
+        ----------
+        data_iterable : tuple
+            The input data iterable to be validated.
+
+        """
+        if data_iterable is None:
+            raise TypeError("You must provide a data iterable to train on")
+        elif isinstance(data_iterable, str):
+            raise TypeError(
+                "Passed string. Input data must be iterable list of list of tokens or tuple"
+            )
+        elif not hasattr(data_iterable, "__iter__"):
+            raise TypeError("Iterable must provide __iter__ function")
+
+    def _log_train_end(self, eff_sentences: int, eff_words: int, overall_time: float):
+        """Log the end of training.
+
+        Parameters
+        ----------
+        eff_sentences : int
+            Number of effective (non-zero) sentences encountered in training.
+        eff_words : int
+            Number of effective words used in training (after ignoring unknown words).
+        overall_time : float
+            Time in seconds for the task to be completed.
+
+        """
+        logger.info(
+            f"training on {eff_sentences} effective sentences with {eff_words} effective words "
+            f"took {int(overall_time)}s with {int(eff_sentences / overall_time)} sentences/s"
+        )
+
+    def _check_pre_training_sanity(
+        self, total_sentences: int, total_words: int, average_length: int, **kwargs
+    ):
+        """Check if all available objects for training are available and compliant
+
+        Parameters
+        ----------
+        total_sentences : int
+            Number of sentences encountered while scanning
+        total_words : int
+            Number of words encountered while scanning
+        average_length : int
+            Average sentence length
+
+        """
+        if not hasattr(self, "wv") or self.wv is None:
+            raise RuntimeError("you must first load a valid KeyedVectors object")
+        if not len(self.wv.vectors):
+            raise RuntimeError(
+                "you must initialize vectors before computing sentence vectors"
+            )
+
+        if self.is_ft and not len(self.wv.vectors_ngrams):
+            raise RuntimeError(
+                "you must initialize ngram vectors before computing sentence vectors"
+            )
+        if self.is_ft and not len(self.wv.vectors_vocab):
+            raise RuntimeError(
+                "you must initialize vectors_vocab before computing sentence vectors"
+            )
+
+        if sum([self.wv.get_vecattr(w, "count") for w in self.wv.key_to_index]) == len(
+            self.wv
+        ):
+            logger.warning(
+                "The sum of the word counts is equal to its length (all word counts are 1). "
+                "Make sure to obtain proper word counts by using lang_freq for pretrained embeddings."
+            )
+
+        if not hasattr(self.sv, "vectors") or self.sv.vectors is None:
+            raise RuntimeError("initialization of Sentencevectors failed")
+        if not hasattr(self, "word_weights") or self.word_weights is None:
+            raise RuntimeError("initialization of word weights failed")
+
+        if not len(self.wv.vectors) == len(self.word_weights):
+            raise RuntimeError("Number of word vectors and weights does not match")
+
+        if self.wv.vectors.dtype != REAL:
+            raise TypeError(f"type of wv.vectors is wrong: {self.wv.vectors.dtype}")
+        if self.is_ft and self.wv.vectors_ngrams.dtype != REAL:
+            raise TypeError(
+                f"type of wv.vectors_ngrams is wrong: {self.wv.vectors_ngrams.dtype}"
+            )
+        if self.is_ft and self.wv.vectors_vocab.dtype != REAL:
+            raise TypeError(
+                f"type of wv.vectors_vocab is wrong: {self.wv.vectors_vocab.dtype}"
+            )
+        if self.sv.vectors.dtype != REAL:
+            raise TypeError(f"type of sv.vectors is wrong: {self.sv.vectors.dtype}")
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+
+        if total_sentences == 0 or total_words == 0 or average_length == 0:
+            raise ValueError(
+                f"scanning the sentences returned invalid values. Check the input."
+            )
+
+    def _check_post_training_sanity(self, eff_sentences: int, eff_words: int):
+        """Check if the training results make sense
+
+        Parameters
+        ----------
+        eff_sentences : int
+            Number of effective sentences encountered during training
+        eff_words : int
+            Number of effective words encountered during training
+
+        """
+        if eff_sentences == 0 or eff_words == 0:
+            raise ValueError(f"training returned invalid values. Check the input.")
+
+    def _check_indexed_sent_valid(
+        self, iterPos: int, obj: tuple, checked: int = False
+    ) -> Tuple[int, List[str]]:
+        """Performs a check if the passed object contains valid data
+
+        Parameters
+        ----------
+        iterPos : int
+            Position in file/iterable
+        obj : tuple
+            A tuple containing the sentence and its index
+
+        Returns
+        -------
+        int
+            Index of the sentence used to write to (in sv.vectors)
+        list
+            List of strings containing all words in a sentence
+
+        """
+
+        if isinstance(obj, tuple):
+            sent = obj[0]  # Faster than obj.words
+            index = obj[1]
+        else:
+            raise TypeError(f"Passed {type(obj)}: {obj}. Iterable must contain tuple.")
+
+        if not checked:
+            if not isinstance(sent, list) or not all(isinstance(w, str) for w in sent):
+                raise TypeError(
+                    f"At {iterPos}: Passed {type(sent)}: {sent}. tuple.words must contain list of str."
+                )
+            if not isinstance(index, int):
+                raise TypeError(
+                    f"At {iterPos}: Passed {type(index)}: {index}. tuple.index must contain index"
+                )
+            if index < 0:
+                raise ValueError(f"At {iterPos}: Passed negative {index}")
+        return index, sent
+
+    def _map_all_vectors_to_disk(self, mapfile_path: Path):
+        """Maps all vectors to disk
+
+        Parameters
+        ----------
+        mapfile_path : Path
+            Path where to write the vectors to
+
+        """
+        path = str(mapfile_path.absolute())
+
+        self.wv_mapfile_shapes["vectors"] = self.wv.vectors.shape
+        self.wv.vectors = self._move_ndarray_to_disk(
+            self.wv.vectors, mapfile_path=path, name="wv"
+        )
+        if self.is_ft:
+            self.wv_mapfile_shapes["vectors_vocab"] = self.wv.vectors_vocab.shape
+            self.wv_mapfile_shapes["vectors_ngrams"] = self.wv.vectors_ngrams.shape
+            self.wv.vectors_vocab = self._move_ndarray_to_disk(
+                self.wv.vectors_vocab, mapfile_path=self.wv_mapfile_path, name="vocab"
+            )
+            self.wv.vectors_ngrams = self._move_ndarray_to_disk(
+                self.wv.vectors_ngrams, mapfile_path=self.wv_mapfile_path, name="ngrams"
+            )
+
+    def _load_all_vectors_from_disk(self, mapfile_path: Path):
+        """Reads all vectors from disk
+
+        Parameters
+        ----------
+        mapfile_path : Path
+            Path where to read the vectors from
+
+        """
+        path = str(mapfile_path.absolute())
+
+        self.wv.vectors = np_memmap(
+            f"{path}_wv.vectors",
+            dtype=REAL,
+            mode="r",
+            shape=self.wv_mapfile_shapes["vectors"],
+        )
+        if self.is_ft:
+            self.wv.vectors_vocab = np_memmap(
+                f"{path}_vocab.vectors",
+                dtype=REAL,
+                mode="r",
+                shape=self.wv_mapfile_shapes["vectors_vocab"],
+            )
+            self.wv.vectors_ngrams = np_memmap(
+                f"{path}_ngrams.vectors",
+                dtype=REAL,
+                mode="r",
+                shape=self.wv_mapfile_shapes["vectors_ngrams"],
+            )
+
+    def _move_ndarray_to_disk(
+        self, vector: ndarray, mapfile_path: str, name: str = ""
+    ) -> ndarray:
+        """Moves a numpy ndarray to disk via memmap
+
+        Parameters
+        ----------
+        vector : ndarray
+            The vector to write to disk
+        mapfile_path : Path
+            Path where to write the vector to
+        name : str
+            Suffix which is appended to the path to distinguish multiple files
+
+        Returns
+        -------
+        ndarray
+            readonly ndarray to be used in further computations
+
+        """
+        shape = vector.shape
+        path = Path(f"{mapfile_path}_{name}.vectors")
+
+        if not path.exists():
+            logger.info(f"writing {name} to {path}")
+            memvecs = np_memmap(path, dtype=REAL, mode="w+", shape=shape)
+            memvecs[:] = vector[:]
+            del memvecs, vector
+        else:
+            # If multiple instances of this class exist, all can access the same files
+            logger.info(f"loading pre-existing {name} from {path}")
+
+        readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape)
+        return readonly_memvecs
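+    # File layout sketch (illustrative values): for mapfile_path="model" and
+    # name="wv", the array is persisted to "model_wv.vectors" and handed back
+    # as a read-only float32 memmap of the same shape.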
+
+    def _get_thread_working_mem(self) -> Tuple[ndarray, ndarray]:
+        """Computes the memory used per worker thread.
+
+        Returns
+        -------
+        np.ndarray
+            Each worker threads private work memory.
+
+        """
+        mem = zeros_aligned(self.sv.vector_size, dtype=REAL)
+        oov_mem = zeros_aligned((self.batch_words, self.batch_ngrams), dtype=uINT)
+        return (mem, oov_mem)
+
+    def _do_train_job(
+        self, data_iterable: List[tuple], target: ndarray, memory: ndarray
+    ) -> Tuple[int, int]:
+        """ Function to be called on a batch of sentences. Returns eff sentences/words """
+        raise NotImplementedError()
+
+    def _pre_train_calls(self, **kwargs):
+        """ Function calls to perform before training """
+        raise NotImplementedError()
+
+    def _post_train_calls(self, **kwargs):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        raise NotImplementedError()
+
+    def _post_inference_calls(self, **kwargs):
+        """Function calls to perform after training & inference
+        Examples include the removal of components
+        """
+        raise NotImplementedError()
+
+    def _check_parameter_sanity(self, **kwargs):
+        """ Check the sanity of all child paramters """
+        raise NotImplementedError()
+
+    def _check_dtype_santiy(self, **kwargs):
+        """ Check the dtypes of all child attributes """
+        raise NotImplementedError()
+
+    @classmethod
+    def load(cls, *args, **kwargs):
+        """Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the saved file.
+
+        Returns
+        -------
+        :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+            Loaded model.
+
+        """
+        # This is kind of an ugly hack because I cannot directly modify the save routine of the
+        # corresponding KeyedVectors files, as a memmap file makes the npy files irrelevant
+        model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)
+
+        if model.wv_mapfile_path is not None:
+            model._load_all_vectors_from_disk(model.wv_mapfile_path)
+        model.wv_mapfile_shapes = None
+
+        set_madvise_for_mmap()
+
+        return model
+
+    def save(self, *args, **kwargs):
+        """Save the model.
+        This saved model can be loaded again using :func:`~fse.models.base_s2v.BaseSentence2VecModel.load`
+
+        Parameters
+        ----------
+        fname : str
+            Path to the file.
+
+        """
+        # Manually removes vectors from the wv class because we cannot modify the save method
+        if self.wv_mapfile_path is not None:
+            self.wv.vectors = None
+        super(BaseSentence2VecModel, self).save(*args, **kwargs)
+
+    def scan_sentences(
+        self, sentences: List[tuple] = None, progress_per: int = 5
+    ) -> Dict[str, int]:
+        """Performs an initial scan of the data and reports all corresponding statistics
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        progress_per : int
+            Number of seconds to pass before reporting the scan progress
+
+        Returns
+        -------
+        dict
+            Dictionary containing the scan statistics
+
+        """
+        logger.info("scanning all indexed sentences and their word counts")
+
+        current_time = time()
+        total_sentences = 0
+        total_words = 0
+        average_length = 0
+        empty_sentences = 0
+        max_index = 0
+        checked_sentences = (
+            0  # We only check the first item to not constrain runtime so much
+        )
+
+        for i, obj in enumerate(sentences):
+            index, sent = self._check_indexed_sent_valid(
+                iterPos=i, obj=obj, checked=checked_sentences
+            )
+            checked_sentences += 1
+            if time() - current_time > progress_per:
+                current_time = time()
+                logger.info(
+                    f"SCANNING : finished {total_sentences} sentences with {total_words} words"
+                )
+
+            max_index = max(max_index, index)
+            total_sentences += 1
+            total_words += len(sent)
+
+            if not len(sent):
+                empty_sentences += 1
+
+        if empty_sentences:
+            logger.warning(f"found {empty_sentences} empty sentences")
+
+        if max_index >= total_sentences:
+            raise RuntimeError(
+                f"Index {max_index} is larger than number of sentences {total_sentences}"
+            )
+
+        average_length = int(total_words / total_sentences)
+
+        logger.info(
+            f"finished scanning {total_sentences} sentences with an average length of {average_length} and {total_words} total words"
+        )
+        statistics = {
+            "total_sentences": total_sentences,
+            "total_words": total_words,
+            "average_length": average_length,
+            "empty_sentences": empty_sentences,
+            "max_index": max_index + 1,
+        }
+        return statistics
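+    # Example (sketch): scan_sentences([(["hello", "world"], 0)]) would return
+    # {"total_sentences": 1, "total_words": 2, "average_length": 2,
+    #  "empty_sentences": 0, "max_index": 1}.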
+
+    def estimate_memory(
+        self, max_index: int, report: dict = None, **kwargs
+    ) -> Dict[str, int]:
+        """Estimate the size of the sentence embedding
+
+        Parameters
+        ----------
+        max_index : int
+            Maximum index found during the initial scan
+        report : dict
+            Report of subclasses
+
+        Returns
+        -------
+        dict
+            Dictionary of estimated memory sizes
+
+        """
+        vocab_size = len(self.wv.vectors)
+
+        report = report or {}
+        report["Word Weights"] = vocab_size * dtype(REAL).itemsize
+        report["Word Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+        report["Sentence Vectors"] = (
+            max_index * self.wv.vector_size * dtype(REAL).itemsize
+        )
+        if self.is_ft:
+            report["Vocab Vectors"] = (
+                vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+            )
+            report["Ngram Vectors"] = (
+                self.wv.vectors_ngrams.shape[0]
+                * self.wv.vector_size
+                * dtype(REAL).itemsize
+            )
+        report["Total"] = sum(report.values())
+        mb_size = int(report["Total"] / 1024 ** 2)
+        logger.info(
+            f"estimated memory for {max_index} sentences with "
+            f"{self.wv.vector_size} dimensions and {vocab_size} vocabulary: "
+            f"{mb_size} MB ({int(mb_size / 1024)} GB)"
+        )
+        if report["Total"] >= 0.95 * virtual_memory()[1]:
+            logger.warning(
+                "The embeddings will likely not fit into RAM. Consider to use mapfile_path"
+            )
+        return report
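+    # Rough arithmetic (assumed sizes): 100,000 sentences with 300-dimensional
+    # float32 vectors require 100_000 * 300 * 4 bytes, roughly 114 MB, for the
+    # "Sentence Vectors" entry alone.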
+
+    def train(
+        self,
+        sentences: List[tuple] = None,
+        update: bool = False,
+        queue_factor: int = 2,
+        report_delay: int = 5,
+    ) -> Tuple[int, int]:
+        """Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is
+        used for computing embeddings for large chunks of data. This method also handles post-training transformations,
+        such as computing the SVD of the sentence vectors.
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        update : bool
+            If bool is True, the sentence vector matrix will be updated in size (even with memmap)
+        queue_factor : int
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        int, int
+            Count of effective sentences and words encountered
+
+        """
+        self._check_input_data_sanity(sentences)
+        statistics = self.scan_sentences(sentences)
+
+        self._check_pre_training_sanity(**statistics)
+
+        self.estimate_memory(**statistics)
+        self.prep.prepare_vectors(
+            sv=self.sv, total_sentences=statistics["max_index"], update=update
+        )
+
+        # Perform pre-train calls (i.e., weight computation)
+        self._pre_train_calls(**statistics)
+        self._check_parameter_sanity()
+        self._check_dtype_santiy()
+        start_time = time()
+
+        logger.info(f"begin training")
+
+        _, eff_sentences, eff_words = self._train_manager(
+            data_iterable=sentences,
+            total_sentences=statistics["total_sentences"],
+            queue_factor=queue_factor,
+            report_delay=report_delay,
+        )
+
+        overall_time = time() - start_time
+
+        self._check_post_training_sanity(
+            eff_sentences=eff_sentences, eff_words=eff_words
+        )
+
+        # Perform post-train calls (i.e., principal component removal)
+        self._post_train_calls()
+
+        self._log_train_end(
+            eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time
+        )
+
+        return eff_sentences, eff_words
+
+    def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray:
+        """Secondary routine to train an embedding. This method is essential for small batches of sentences,
+        which require little computation. Note: This method does not apply post-training transformations,
+        only post inference calls (such as removing principal components).
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        use_norm : bool
+            If bool is True, the sentence vectors will be L2 normalized (unit euclidean length)
+
+        Returns
+        -------
+        ndarray
+            Computed sentence vectors
+
+        """
+        self._check_input_data_sanity(sentences)
+
+        statistics = self.scan_sentences(sentences)
+
+        output = zeros((statistics["max_index"], self.sv.vector_size), dtype=REAL)
+        mem = self._get_thread_working_mem()
+
+        job_batch, batch_size = [], 0
+        for data_idx, data in enumerate(sentences):
+            data_length = len(data[0])
+            if batch_size + data_length <= self.batch_words:
+                job_batch.append(data)
+                batch_size += data_length
+            else:
+                self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+                job_batch, batch_size = [data], data_length
+        if job_batch:
+            self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+
+        self._post_inference_calls(output=output)
+
+        if use_norm:
+            output = _l2_norm(output)
+        return output
+
+    def _train_manager(
+        self,
+        data_iterable: List[tuple],
+        total_sentences: int = None,
+        queue_factor: int = 2,
+        report_delay: int = 5,
+    ):
+        """Manager for the multi-core implementation. Directly adapted from gensim
+
+        Parameters
+        ----------
+        data_iterable : (list, iterable)
+            An iterable consisting of tuple objects. This will be split in chunks and these chunks will be pushed to the queue.
+        total_sentences : int
+            Number of sentences found during the initial scan
+        queue_factor : int
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        """
+        job_queue = Queue(maxsize=queue_factor * self.workers)
+        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
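+        # Sizing sketch: with workers=4 and queue_factor=2, the job queue holds
+        # at most 8 pending batches and the progress queue at most 12 reports.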
+
+        # WORKING Threads
+        workers = [
+            threading.Thread(target=self._worker_loop, args=(job_queue, progress_queue))
+            for _ in range(self.workers)
+        ]
+        # JOB PRODUCER
+        workers.append(
+            threading.Thread(target=self._job_producer, args=(data_iterable, job_queue))
+        )
+
+        for thread in workers:
+            thread.daemon = True  # make interrupting the process with ctrl+c easier
+            thread.start()
+
+        jobs, eff_sentences, eff_words = self._log_train_progress(
+            progress_queue, total_sentences=total_sentences, report_delay=report_delay
+        )
+        return jobs, eff_sentences, eff_words
+
+    def _worker_loop(self, job_queue, progress_queue):
+        """Train the model, lifting batches of data from the queue.
+
+        This function will be called in parallel by multiple workers (threads or processes) to make
+        optimal use of multicore machines.
+
+        Parameters
+        ----------
+        job_queue : Queue of (list of tuple)
+            A queue of jobs still to be processed. The worker will take up jobs from this queue.
+            Each job is represented as a batch of tuple.
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of job processed
+                * Effective sentences encountered in training
+                * Effective words encountered in training
+
+        """
+        mem = self._get_thread_working_mem()
+        jobs_processed = 0
+        while True:
+            job = job_queue.get()
+            if job is None:
+                progress_queue.put(None)
+                # no more jobs => quit this worker
+                break
+            eff_sentences, eff_words = self._do_train_job(
+                data_iterable=job, target=self.sv.vectors, memory=mem
+            )
+            progress_queue.put((len(job), eff_sentences, eff_words))
+            jobs_processed += 1
+        logger.debug(f"worker exiting, processed {jobs_processed} jobs")
+
+    def _job_producer(self, data_iterable: List[tuple], job_queue: Queue):
+        """Fill the jobs queue using the data found in the input stream.
+
+        Each job is represented as a batch of tuple
+
+        Parameters
+        ----------
+        data_iterable : (list, iterable)
+            An iterable consisting of tuple objects. This will be split in chunks and these chunks will be pushed to the queue.
+        job_queue : Queue of (list of tuple)
+            A queue of jobs still to be processed. The worker will take up jobs from this queue.
+            Each job is represented as a batch of tuple.
+
+        """
+
+        job_batch, batch_size = [], 0
+        job_no = 0
+
+        for data_idx, data in enumerate(data_iterable):
+            data_length = len(data[0])
+            if batch_size + data_length <= self.batch_words:
+                job_batch.append(data)
+                batch_size += data_length
+            else:
+                job_no += 1
+                job_queue.put(job_batch)
+                job_batch, batch_size = [data], data_length
+
+        if job_batch:
+            job_no += 1
+            job_queue.put(job_batch)
+
+        for _ in range(self.workers):
+            job_queue.put(None)
+        logger.debug(f"job loop exiting, total {job_no} jobs")
+
+    def _log_train_progress(
+        self, progress_queue: Queue, total_sentences: int = None, report_delay: int = 5
+    ):
+        """Log the training process after a couple of seconds.
+
+        Parameters
+        ----------
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of job processed
+                * Effective sentences encountered in training
+                * Effective words encountered in training
+        total_sentences : int
+            Number of sentences found during the initial scan
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        int, int, int
+            number of jobs, effective sentences, and effective words in training
+
+        """
+        jobs, eff_sentences, eff_words = 0, 0, 0
+        unfinished_worker_count = self.workers
+        start_time = time()
+        sentence_inc = 0
+        while unfinished_worker_count > 0:
+            report = progress_queue.get()
+            if report is None:  # a thread reporting that it finished
+                unfinished_worker_count -= 1
+                logger.info(
+                    f"worker thread finished; awaiting finish of {unfinished_worker_count} more threads"
+                )
+                continue
+
+            j, s, w = report
+            jobs += j
+            eff_sentences += s
+            eff_words += w
+            if time() - start_time >= report_delay:
+                start_time = time()
+
+                logger.info(
+                    "PROGRESS : finished {:3.2f}% with {} sentences and {} words, {} sentences/s".format(
+                        100 * (eff_sentences / total_sentences),
+                        eff_sentences,
+                        eff_words,
+                        int((eff_sentences - sentence_inc) / report_delay),
+                    )
+                )
+                sentence_inc = eff_sentences
+
+        return jobs, eff_sentences, eff_words
+
+
+class BaseSentence2VecPreparer(SaveLoad):
+    """ Contains helper functions to perpare the weights for the training of BaseSentence2VecModel """
+
+    def prepare_vectors(
+        self, sv: SentenceVectors, total_sentences: int, update: bool = False
+    ):
+        """Build tables and model weights based on final vocabulary settings."""
+        if not update:
+            self.reset_vectors(sv, total_sentences)
+        else:
+            self.update_vectors(sv, total_sentences)
+
+    def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
+        """Initialize all sentence vectors to zero and overwrite existing files"""
+        logger.info(f"initializing sentence vectors for {total_sentences} sentences")
+        if sv.mapfile_path:
+            sv.vectors = np_memmap(
+                str(sv.mapfile_path) + ".vectors",
+                dtype=REAL,
+                mode="w+",
+                shape=(total_sentences, sv.vector_size),
+            )
+        else:
+            sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+
+        for i in range(total_sentences):
+            sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+        sv.vectors_norm = None
+
+    def update_vectors(self, sv: SentenceVectors, total_sentences: int):
+        """Given existing sentence vectors, append new ones"""
+        logger.info(f"appending sentence vectors for {total_sentences} sentences")
+        sentences_before = len(sv.vectors)
+        sentences_after = len(sv.vectors) + total_sentences
+
+        if sv.mapfile_path:
+            sv.vectors = np_memmap(
+                str(sv.mapfile_path) + ".vectors",
+                dtype=REAL,
+                mode="r+",
+                shape=(sentences_after, sv.vector_size),
+            )
+            for i in range(sentences_before, sentences_after):
+                sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+        else:
+            newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+            for i in range(total_sentences):
+                newvectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+            sv.vectors = vstack([sv.vectors, newvectors])
+        sv.vectors_norm = None
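+
+    # Usage sketch: prep.prepare_vectors(sv=model.sv, total_sentences=100)
+    # allocates a (100, vector_size) float32 matrix (a memmap when
+    # sv.mapfile_path is set) initialised to EPS, or appends 100 rows when
+    # update=True.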
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class BaseSentence2VecModel(model: gensim.models.keyedvectors.KeyedVectors, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None, fast_version: int = 0, batch_words: int = 10000, batch_ngrams: int = 40, **kwargs)
+
+

Serialize/deserialize objects from disk, by equipping them with the save() / load() methods.

+

Warnings

+

This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes such as lambda functions etc.

+

Base class for all Sentence2Vec models. Provides core functionality, such as save, load, sanity checking, frequency induction, data checking, scanning, etc.

+

Parameters

+
+
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings the wv and wv.vectors elements are required.
+
sv_mapfile_path : str, optional
+
Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+
wv_mapfile_path : str, optional
+
Optional path to store the word-vectors in for very large datasets. Used for memmap. Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+
workers : int, optional
+
Number of working threads, used for multithreading. For most tasks (few words in a sentence) a value of 1 should be more than enough.
+
lang_freq : str, optional
+
Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about the frequency of a word. As the frequency is required for estimating the word weights, we induce frequencies into the wv object based on :class:`~wordfreq`. If no frequency information is available, you can choose the language to estimate the frequency. See https://github.com/LuminosoInsight/wordfreq
+
fast_version : {-1, 1}, optional
+
Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
+
batch_words : int, optional
+
Number of words to be processed by a single job.
+
batch_ngrams : int, optional
+
Maximum number of ngrams for OOV words.
+
**kwargs : object
+
Key word arguments needed to allow children classes to accept more arguments.
+
+
class BaseSentence2VecModel(SaveLoad):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+        fast_version: int = 0,
+        batch_words: int = 10000,
+        batch_ngrams: int = 40,
+        **kwargs,
+    ):
+        """Base class for all Sentence2Vec Models. Provides core functionality, such as
+        save, load, sanity checking, frequency induction, data checking, scanning, etc.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv and wv.vectors attributes are required.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, e.g. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv based on :class:`~wordfreq`.
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+        fast_version : {-1, 1}, optional
+            Whether or not the fast cython implementation of the internal training methods is available. 1 means it is.
+        batch_words : int, optional
+            Number of words to be processed by a single job.
+        batch_ngrams : int, optional
+            Maximum number of ngrams for OOV words.
+        **kwargs : object
+            Keyword arguments that allow child classes to accept additional arguments.
+
+        """
+        set_madvise_for_mmap()
+
+        self.workers = int(workers)
+        self.batch_words = batch_words
+        self.batch_ngrams = batch_ngrams
+        self.wv = None
+
+        self.is_ft = False
+
+        self.wv_mapfile_path = (
+            Path(wv_mapfile_path) if wv_mapfile_path is not None else None
+        )
+        self.wv_mapfile_shapes = {}
+
+        if fast_version < 0:
+            warnings.warn(
+                "C extension not loaded, training/inferring will be slow. "
+                "Install a C compiler and reinstall fse."
+            )
+
+        self._check_and_include_model(model)
+
+        if self.wv_mapfile_path is not None:
+            self._map_all_vectors_to_disk(self.wv_mapfile_path)
+
+        if lang_freq is not None:
+            self._check_language_settings(lang_freq)
+            self._induce_frequencies()
+
+        self.sv = SentenceVectors(
+            vector_size=self.wv.vector_size, mapfile_path=sv_mapfile_path
+        )
+        self.prep = BaseSentence2VecPreparer()
+
+        self.word_weights = ones(len(self.wv), REAL)
+
+    def __str__(self) -> str:
+        """Human readable representation of the model's state.
+
+        Returns
+        -------
+        str
+            Human readable representation of the model's state.
+
+        """
+        return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, vector_size={len(self.sv)}"
+
+    def _check_and_include_model(self, model: KeyedVectors):
+        """Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations.
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            The model to inject into this class.
+
+        """
+        if isinstance(model, (Word2Vec, FastText)):
+            if not hasattr(model, "wv"):
+                raise RuntimeError("Model does not contain wv object.")
+            self.wv = model.wv
+        elif isinstance(model, KeyedVectors):
+            self.wv = model
+        else:
+            raise RuntimeError(
+                f"Model must be child of BaseWordEmbeddingsModel or KeyedVectors. Received {str(model)}"
+            )
+        self.wv.vectors_norm = None
+
+        if isinstance(self.wv, FastTextKeyedVectors):
+            self.wv.vectors_vocab_norm = None  # Save some space
+            self.wv.vectors_ngrams_norm = None
+            self.is_ft = True
+
+            if not self.wv.compatible_hash:
+                raise RuntimeError("FastText model requires compatible hash function")
+            if not hasattr(self.wv, "vectors_vocab") or self.wv.vectors_vocab is None:
+                raise RuntimeError(
+                    "vectors_vocab required for sentence embeddings not found."
+                )
+            if not hasattr(self.wv, "vectors_ngrams") or self.wv.vectors_ngrams is None:
+                raise RuntimeError(
+                    "Ngram vectors required for sentence embeddings not found."
+                )
+
+        if not hasattr(self.wv, "vectors") or self.wv.vectors is None:
+            raise RuntimeError(
+                "Word vectors required for sentence embeddings not found."
+            )
+
+    def _check_language_settings(self, lang_freq: str):
+        """Check if the supplied language is a compatible with the wordfreq package
+
+        Parameters
+        ----------
+        lang_freq : str
+            The language used to induce the frequencies into the wv object.
+
+        """
+        if lang_freq in available_languages(wordlist="best"):
+            self.lang_freq = str(lang_freq)
+            logger.info(
+                "no frequency mode: using wordfreq for estimation "
+                f"of frequency for language: {self.lang_freq}"
+            )
+        else:
+            raise ValueError(f"Language {lang_freq} is not available in wordfreq")
+
+    def _induce_frequencies(self, domain: int = 2 ** 31 - 1):
+        """Induce frequencies for a pretrained model, as not all pretrained models come with frequencies.
+
+        Parameters
+        ----------
+        domain : int
+            The cumulative count of the vocabulary.
+
+        """
+        freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
+        for word in self.wv.index_to_key:
+            if word in freq_dict:
+                self.wv.set_vecattr(word, "count", int(freq_dict[word] * domain))
+            else:
+                self.wv.set_vecattr(word, "count", int(1e-8 * domain))
+
+    def _check_input_data_sanity(self, data_iterable: tuple):
+        """Check if the input data complies with the required formats
+
+        Parameters
+        ----------
+        data_iterable : tuple
+            The iterable of sentences to check.
+
+        """
+        if data_iterable is None:
+            raise TypeError("You must provide a data iterable to train on")
+        elif isinstance(data_iterable, str):
+            raise TypeError(
+                "Passed string. Input data must be iterable list of list of tokens or tuple"
+            )
+        elif not hasattr(data_iterable, "__iter__"):
+            raise TypeError("Iterable must provide __iter__ function")
+
+    def _log_train_end(self, eff_sentences: int, eff_words: int, overall_time: float):
+        """Log the end of training.
+
+        Parameters
+        ----------
+        eff_sentences : int
+            Number of effective (non-zero) sentences encountered in training.
+        eff_words : int
+            Number of effective words used in training (after ignoring unknown words).
+        overall_time : float
+            Time in seconds for the task to be completed.
+
+        """
+        logger.info(
+            f"training on {eff_sentences} effective sentences with {eff_words} effective words "
+            f"took {int(overall_time)}s with {int(eff_sentences / overall_time)} sentences/s"
+        )
+
+    def _check_pre_training_sanity(
+        self, total_sentences: int, total_words: int, average_length: int, **kwargs
+    ):
+        """Check if all available objects for training are available and compliant
+
+        Parameters
+        ----------
+        total_sentences : int
+            Number of sentences encountered while scanning
+        total_words : int
+            Number of words encountered while scanning
+        average_length : int
+            Average sentence length
+
+        """
+        if not hasattr(self, "wv") or self.wv is None:
+            raise RuntimeError("you must first load a valid KeyedVectors object")
+        if not len(self.wv.vectors):
+            raise RuntimeError(
+                "you must initialize vectors before computing sentence vectors"
+            )
+
+        if self.is_ft and not len(self.wv.vectors_ngrams):
+            raise RuntimeError(
+                "you must initialize ngram vectors before computing sentence vectors"
+            )
+        if self.is_ft and not len(self.wv.vectors_vocab):
+            raise RuntimeError(
+                "you must initialize vectors_vocab before computing sentence vectors"
+            )
+
+        if sum([self.wv.get_vecattr(w, "count") for w in self.wv.key_to_index]) == len(
+            self.wv
+        ):
+            logger.warning(
+                "The sum of the word counts is equal to its length (all word counts are 1). "
+                "Make sure to obtain proper word counts by using lang_freq for pretrained embeddings."
+            )
+
+        if not hasattr(self.sv, "vectors") or self.sv.vectors is None:
+            raise RuntimeError("initialization of Sentencevectors failed")
+        if not hasattr(self, "word_weights") or self.word_weights is None:
+            raise RuntimeError("initialization of word weights failed")
+
+        if not len(self.wv.vectors) == len(self.word_weights):
+            raise RuntimeError("Number of word vectors and weights does not match")
+
+        if self.wv.vectors.dtype != REAL:
+            raise TypeError(f"type of wv.vectors is wrong: {self.wv.vectors.dtype}")
+        if self.is_ft and self.wv.vectors_ngrams.dtype != REAL:
+            raise TypeError(
+                f"type of wv.vectors_ngrams is wrong: {self.wv.vectors_ngrams.dtype}"
+            )
+        if self.is_ft and self.wv.vectors_vocab.dtype != REAL:
+            raise TypeError(
+                f"type of wv.vectors_vocab is wrong: {self.wv.vectors_vocab.dtype}"
+            )
+        if self.sv.vectors.dtype != REAL:
+            raise TypeError(f"type of sv.vectors is wrong: {self.sv.vectors.dtype}")
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+
+        if total_sentences == 0 or total_words == 0 or average_length == 0:
+            raise ValueError(
+                f"scanning the sentences returned invalid values. Check the input."
+            )
+
+    def _check_post_training_sanity(self, eff_sentences: int, eff_words: int):
+        """Check if the training results make sense
+
+        Parameters
+        ----------
+        eff_sentences : int
+            Number of effective sentences encountered during training
+        eff_words : int
+            Number of effective words encountered during training
+
+        """
+        if eff_sentences == 0 or eff_words == 0:
+            raise ValueError(f"training returned invalid values. Check the input.")
+
+    def _check_indexed_sent_valid(
+        self, iterPos: int, obj: tuple, checked: int = False
+    ) -> Tuple[int, List[str]]:
+        """Performs a check if the passed object contains valid data
+
+        Parameters
+        ----------
+        iterPos : int
+            Position in file/iterable
+        obj : tuple
+            A tuple containing the sentence (list of str) and its index
+
+        Returns
+        -------
+        int
+            Index of the sentence used to write to (in sv.vectors)
+        list
+            List of strings containing all words in a sentence
+
+        """
+
+        if isinstance(obj, tuple):
+            sent = obj[0]  # Faster than obj.words
+            index = obj[1]
+        else:
+            raise TypeError(f"Passed {type(obj)}: {obj}. Iterable must contain tuple.")
+
+        if not checked:
+            if not isinstance(sent, list) or not all(isinstance(w, str) for w in sent):
+                raise TypeError(
+                    f"At {iterPos}: Passed {type(sent)}: {sent}. tuple.words must contain list of str."
+                )
+            if not isinstance(index, int):
+                raise TypeError(
+                    f"At {iterPos}: Passed {type(index)}: {index}. tuple.index must contain index"
+                )
+            if index < 0:
+                raise ValueError(f"At {iterPos}: Passed negative {index}")
+        return index, sent
+
+    def _map_all_vectors_to_disk(self, mapfile_path: Path):
+        """Maps all vectors to disk
+
+        Parameters
+        ----------
+        mapfile_path : Path
+            Path where to write the vectors to
+
+        """
+        path = str(mapfile_path.absolute())
+
+        self.wv_mapfile_shapes["vectors"] = self.wv.vectors.shape
+        self.wv.vectors = self._move_ndarray_to_disk(
+            self.wv.vectors, mapfile_path=path, name="wv"
+        )
+        if self.is_ft:
+            self.wv_mapfile_shapes["vectors_vocab"] = self.wv.vectors_vocab.shape
+            self.wv_mapfile_shapes["vectors_ngrams"] = self.wv.vectors_ngrams.shape
+            self.wv.vectors_vocab = self._move_ndarray_to_disk(
+                self.wv.vectors_vocab, mapfile_path=self.wv_mapfile_path, name="vocab"
+            )
+            self.wv.vectors_ngrams = self._move_ndarray_to_disk(
+                self.wv.vectors_ngrams, mapfile_path=self.wv_mapfile_path, name="ngrams"
+            )
+
+    def _load_all_vectors_from_disk(self, mapfile_path: Path):
+        """Reads all vectors from disk
+
+        Parameters
+        ----------
+        mapfile_path : Path
+            Path where to read the vectors from
+
+        """
+        path = str(mapfile_path.absolute())
+
+        self.wv.vectors = np_memmap(
+            f"{path}_wv.vectors",
+            dtype=REAL,
+            mode="r",
+            shape=self.wv_mapfile_shapes["vectors"],
+        )
+        if self.is_ft:
+            self.wv.vectors_vocab = np_memmap(
+                f"{path}_vocab.vectors",
+                dtype=REAL,
+                mode="r",
+                shape=self.wv_mapfile_shapes["vectors_vocab"],
+            )
+            self.wv.vectors_ngrams = np_memmap(
+                f"{path}_ngrams.vectors",
+                dtype=REAL,
+                mode="r",
+                shape=self.wv_mapfile_shapes["vectors_ngrams"],
+            )
+
+    def _move_ndarray_to_disk(
+        self, vector: ndarray, mapfile_path: str, name: str = ""
+    ) -> ndarray:
+        """Moves a numpy ndarray to disk via memmap
+
+        Parameters
+        ----------
+        vector : ndarray
+            The vector to write to disk
+        mapfile_path : Path
+            Path where to write the vector to
+        name : str
+            Suffix which is appended to the path to distinguish multiple files
+
+        Returns
+        -------
+        ndarray
+            readonly ndarray to be used in further computations
+
+        """
+        shape = vector.shape
+        path = Path(f"{mapfile_path}_{name}.vectors")
+
+        if not path.exists():
+            logger.info(f"writing {name} to {path}")
+            memvecs = np_memmap(path, dtype=REAL, mode="w+", shape=shape)
+            memvecs[:] = vector[:]
+            del memvecs, vector
+        else:
+            # If multiple instances of this class exist, all can access the same files
+            logger.info(f"loading pre-existing {name} from {path}")
+
+        readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape)
+        return readonly_memvecs
+
+    def _get_thread_working_mem(self) -> Tuple[ndarray, ndarray]:
+        """Computes the memory used per worker thread.
+
+        Returns
+        -------
+        np.ndarray
+            Each worker threads private work memory.
+
+        """
+        mem = zeros_aligned(self.sv.vector_size, dtype=REAL)
+        oov_mem = zeros_aligned((self.batch_words, self.batch_ngrams), dtype=uINT)
+        return (mem, oov_mem)
+
+    def _do_train_job(
+        self, data_iterable: List[tuple], target: ndarray, memory: ndarray
+    ) -> Tuple[int, int]:
+        """ Function to be called on a batch of sentences. Returns eff sentences/words """
+        raise NotImplementedError()
+
+    def _pre_train_calls(self, **kwargs):
+        """ Function calls to perform before training """
+        raise NotImplementedError()
+
+    def _post_train_calls(self, **kwargs):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        raise NotImplementedError()
+
+    def _post_inference_calls(self, **kwargs):
+        """Function calls to perform after training & inference
+        Examples include the removal of components
+        """
+        raise NotImplementedError()
+
+    def _check_parameter_sanity(self, **kwargs):
+        """ Check the sanity of all child paramters """
+        raise NotImplementedError()
+
+    def _check_dtype_santiy(self, **kwargs):
+        """ Check the dtypes of all child attributes """
+        raise NotImplementedError()
+
+    @classmethod
+    def load(cls, *args, **kwargs):
+        """Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the saved file.
+
+        Returns
+        -------
+        :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+            Loaded model.
+
+        """
+        # This is kind of an ugly hack because I cannot directly modify the save routine of the
+        # corresponding KeyedVectors files, as a memmap file makes the npy files irrelevant
+        model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)
+
+        if model.wv_mapfile_path is not None:
+            model._load_all_vectors_from_disk(model.wv_mapfile_path)
+        model.wv_mapfile_shapes = None
+
+        set_madvise_for_mmap()
+
+        return model
+
+    def save(self, *args, **kwargs):
+        """Save the model.
+        This saved model can be loaded again using :func:`~fse.models.base_s2v.BaseSentence2VecModel.load`
+
+        Parameters
+        ----------
+        fname : str
+            Path to the file.
+
+        """
+        # Manually removes vectors from the wv class because we cannot modify the save method
+        if self.wv_mapfile_path is not None:
+            self.wv.vectors = None
+        super(BaseSentence2VecModel, self).save(*args, **kwargs)
+
+    def scan_sentences(
+        self, sentences: List[tuple] = None, progress_per: int = 5
+    ) -> Dict[str, int]:
+        """Performs an initial scan of the data and reports all corresponding statistics
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        progress_per : int
+            Number of seconds to pass before reporting the scan progress
+
+        Returns
+        -------
+        dict
+            Dictionary containing the scan statistics
+
+        """
+        logger.info("scanning all indexed sentences and their word counts")
+
+        current_time = time()
+        total_sentences = 0
+        total_words = 0
+        average_length = 0
+        empty_sentences = 0
+        max_index = 0
+        checked_sentences = 0  # We only check the first item to keep the scan fast
+
+        for i, obj in enumerate(sentences):
+            index, sent = self._check_indexed_sent_valid(
+                iterPos=i, obj=obj, checked=checked_sentences
+            )
+            checked_sentences += 1
+            if time() - current_time > progress_per:
+                current_time = time()
+                logger.info(
+                    f"SCANNING : finished {total_sentences} sentences with {total_words} words"
+                )
+
+            max_index = max(max_index, index)
+            total_sentences += 1
+            total_words += len(sent)
+
+            if not len(sent):
+                empty_sentences += 1
+
+        if empty_sentences:
+            logger.warning(f"found {empty_sentences} empty sentences")
+
+        if max_index >= total_sentences:
+            raise RuntimeError(
+                f"Index {max_index} is larger than number of sentences {total_sentences}"
+            )
+
+        average_length = int(total_words / total_sentences)
+
+        logger.info(
+            f"finished scanning {total_sentences} sentences with an average length of {average_length} and {total_words} total words"
+        )
+        statistics = {
+            "total_sentences": total_sentences,
+            "total_words": total_words,
+            "average_length": average_length,
+            "empty_sentences": empty_sentences,
+            "max_index": max_index + 1,
+        }
+        return statistics
+
+    def estimate_memory(
+        self, max_index: int, report: dict = None, **kwargs
+    ) -> Dict[str, int]:
+        """Estimate the size of the sentence embedding
+
+        Parameters
+        ----------
+        max_index : int
+            Maximum index found during the initial scan
+        report : dict
+            Report of subclasses
+
+        Returns
+        -------
+        dict
+            Dictionary of estimated memory sizes
+
+        """
+        vocab_size = len(self.wv.vectors)
+
+        report = report or {}
+        report["Word Weights"] = vocab_size * dtype(REAL).itemsize
+        report["Word Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+        report["Sentence Vectors"] = (
+            max_index * self.wv.vector_size * dtype(REAL).itemsize
+        )
+        if self.is_ft:
+            report["Vocab Vectors"] = (
+                vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+            )
+            report["Ngram Vectors"] = (
+                self.wv.vectors_ngrams.shape[0]
+                * self.wv.vector_size
+                * dtype(REAL).itemsize
+            )
+        report["Total"] = sum(report.values())
+        mb_size = int(report["Total"] / 1024 ** 2)
+        logger.info(
+            f"estimated memory for {max_index} sentences with "
+            f"{self.wv.vector_size} dimensions and {vocab_size} vocabulary: "
+            f"{mb_size} MB ({int(mb_size / 1024)} GB)"
+        )
+        if report["Total"] >= 0.95 * virtual_memory()[1]:
+            logger.warning(
+                "The embeddings will likely not fit into RAM. Consider to use mapfile_path"
+            )
+        return report
+
+    def train(
+        self,
+        sentences: List[tuple] = None,
+        update: bool = False,
+        queue_factor: int = 2,
+        report_delay: int = 5,
+    ) -> Tuple[int, int]:
+        """Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is
+        used for computing embeddings for large chunks of data. This method also handles post-training transformations,
+        such as computing the SVD of the sentence vectors.
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        update : bool
+            If True, the sentence vector matrix will be grown in size (even with memmap)
+        queue_factor : int
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        int, int
+            Count of effective sentences and words encountered
+
+        """
+        self._check_input_data_sanity(sentences)
+        statistics = self.scan_sentences(sentences)
+
+        self._check_pre_training_sanity(**statistics)
+
+        self.estimate_memory(**statistics)
+        self.prep.prepare_vectors(
+            sv=self.sv, total_sentences=statistics["max_index"], update=update
+        )
+
+        # Perform pre-train calls (i.e. weight computation)
+        self._pre_train_calls(**statistics)
+        self._check_parameter_sanity()
+        self._check_dtype_santiy()
+        start_time = time()
+
+        logger.info(f"begin training")
+
+        _, eff_sentences, eff_words = self._train_manager(
+            data_iterable=sentences,
+            total_sentences=statistics["total_sentences"],
+            queue_factor=queue_factor,
+            report_delay=report_delay,
+        )
+
+        overall_time = time() - start_time
+
+        self._check_post_training_sanity(
+            eff_sentences=eff_sentences, eff_words=eff_words
+        )
+
+        # Perform post-train calls (i.e. principal component removal)
+        self._post_train_calls()
+
+        self._log_train_end(
+            eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time
+        )
+
+        return eff_sentences, eff_words
+
+    def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray:
+        """Secondary routine to train an embedding. This method is essential for small batches of sentences,
+        which require little computation. Note: This method does not apply post-training transformations,
+        only post inference calls (such as removing principal components).
+
+        Parameters
+        ----------
+        sentences : (list, iterable)
+            An iterable consisting of tuple objects
+        use_norm : bool
+            If True, the sentence vectors will be L2 normalized (unit euclidean length)
+
+        Returns
+        -------
+        ndarray
+            Computed sentence vectors
+
+        """
+        self._check_input_data_sanity(sentences)
+
+        statistics = self.scan_sentences(sentences)
+
+        output = zeros((statistics["max_index"], self.sv.vector_size), dtype=REAL)
+        mem = self._get_thread_working_mem()
+
+        job_batch, batch_size = [], 0
+        for data_idx, data in enumerate(sentences):
+            data_length = len(data[0])
+            if batch_size + data_length <= self.batch_words:
+                job_batch.append(data)
+                batch_size += data_length
+            else:
+                self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+                job_batch, batch_size = [data], data_length
+        if job_batch:
+            self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+
+        self._post_inference_calls(output=output)
+
+        if use_norm:
+            output = _l2_norm(output)
+        return output
+
+    def _train_manager(
+        self,
+        data_iterable: List[tuple],
+        total_sentences: int = None,
+        queue_factor: int = 2,
+        report_delay: int = 5,
+    ):
+        """Manager for the multi-core implementation. Directly adapted from gensim
+
+        Parameters
+        ----------
+        data_iterable : (list, iterable)
+            An iterable consisting of tuple objects. This will be split in chunks and these chunks will be pushed to the queue.
+        total_sentences : int
+            Number of sentences found during the initial scan
+        queue_factor : int
+            Multiplier for size of queue -> size = number of workers * queue_factor.
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        """
+        job_queue = Queue(maxsize=queue_factor * self.workers)
+        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)
+
+        # WORKING Threads
+        workers = [
+            threading.Thread(target=self._worker_loop, args=(job_queue, progress_queue))
+            for _ in range(self.workers)
+        ]
+        # JOB PRODUCER
+        workers.append(
+            threading.Thread(target=self._job_producer, args=(data_iterable, job_queue))
+        )
+
+        for thread in workers:
+            thread.daemon = True  # make interrupting the process with ctrl+c easier
+            thread.start()
+
+        jobs, eff_sentences, eff_words = self._log_train_progress(
+            progress_queue, total_sentences=total_sentences, report_delay=report_delay
+        )
+        return jobs, eff_sentences, eff_words
+
+    def _worker_loop(self, job_queue, progress_queue):
+        """Train the model, lifting batches of data from the queue.
+
+        This function will be called in parallel by multiple workers (threads or processes) to make
+        optimal use of multicore machines.
+
+        Parameters
+        ----------
+        job_queue : Queue of (list of tuple)
+            A queue of jobs still to be processed. The worker will take up jobs from this queue.
+            Each job is represented as a batch of tuple.
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of job processed
+                * Effective sentences encountered in training
+                * Effective words encountered in training
+
+        """
+        mem = self._get_thread_working_mem()
+        jobs_processed = 0
+        while True:
+            job = job_queue.get()
+            if job is None:
+                progress_queue.put(None)
+                # no more jobs => quit this worker
+                break
+            eff_sentences, eff_words = self._do_train_job(
+                data_iterable=job, target=self.sv.vectors, memory=mem
+            )
+            progress_queue.put((len(job), eff_sentences, eff_words))
+            jobs_processed += 1
+        logger.debug(f"worker exiting, processed {jobs_processed} jobs")
+
+    def _job_producer(self, data_iterable: List[tuple], job_queue: Queue):
+        """Fill the jobs queue using the data found in the input stream.
+
+        Each job is represented as a batch of tuple
+
+        Parameters
+        ----------
+        data_iterable : (list, iterable)
+            An iterable consisting of tuple objects. This will be split in chunks and these chunks will be pushed to the queue.
+        job_queue : Queue of (list of tuple)
+            A queue of jobs still to be processed. The worker will take up jobs from this queue.
+            Each job is represented as a batch of tuple.
+
+        """
+
+        job_batch, batch_size = [], 0
+        job_no = 0
+
+        for data_idx, data in enumerate(data_iterable):
+            data_length = len(data[0])
+            if batch_size + data_length <= self.batch_words:
+                job_batch.append(data)
+                batch_size += data_length
+            else:
+                job_no += 1
+                job_queue.put(job_batch)
+                job_batch, batch_size = [data], data_length
+
+        if job_batch:
+            job_no += 1
+            job_queue.put(job_batch)
+
+        for _ in range(self.workers):
+            job_queue.put(None)
+        logger.debug(f"job loop exiting, total {job_no} jobs")
+
+    def _log_train_progress(
+        self, progress_queue: Queue, total_sentences: int = None, report_delay: int = 5
+    ):
+        """Log the training process after a couple of seconds.
+
+        Parameters
+        ----------
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of job processed
+                * Effective sentences encountered in training
+                * Effective words encountered in training
+        total_sentences : int
+            Number of sentences found during the initial scan
+        report_delay : int
+            Number of seconds between two consecutive progress report messages in the logger.
+
+        Returns
+        -------
+        int, int, int
+            number of jobs, effective sentences, and effective words in training
+
+        """
+        jobs, eff_sentences, eff_words = 0, 0, 0
+        unfinished_worker_count = self.workers
+        start_time = time()
+        sentence_inc = 0
+        while unfinished_worker_count > 0:
+            report = progress_queue.get()
+            if report is None:  # a thread reporting that it finished
+                unfinished_worker_count -= 1
+                logger.info(
+                    f"worker thread finished; awaiting finish of {unfinished_worker_count} more threads"
+                )
+                continue
+
+            j, s, w = report
+            jobs += j
+            eff_sentences += s
+            eff_words += w
+            if time() - start_time >= report_delay:
+                start_time = time()
+
+                logger.info(
+                    "PROGRESS : finished {:3.2f}% with {} sentences and {} words, {} sentences/s".format(
+                        100 * (eff_sentences / total_sentences),
+                        eff_sentences,
+                        eff_words,
+                        int((eff_sentences - sentence_inc) / report_delay),
+                    )
+                )
+                sentence_inc = eff_sentences
+
+        return jobs, eff_sentences, eff_words
+
+

Ancestors

  • gensim.utils.SaveLoad

Subclasses

  • fse.models.average.Average

Static methods

+
+
+def load(*args, **kwargs) +
+
+

Load a previously saved :class:~fse.models.base_s2v.BaseSentence2VecModel.

+

Parameters

+
+
fname : str
+
Path to the saved file.
+
+

Returns

+

:class:~fse.models.base_s2v.BaseSentence2VecModel +Loaded model.

+
+ +Expand source code + +
@classmethod
+def load(cls, *args, **kwargs):
+    """Load a previously saved :class:`~fse.models.base_s2v.BaseSentence2VecModel`.
+
+    Parameters
+    ----------
+    fname : str
+        Path to the saved file.
+
+    Returns
+    -------
+    :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+        Loaded model.
+
+    """
+    # This is kind of an ugly hack because I cannot directly modify the save routine of the
+    # corresponding KeyedVectors files, as a memmap file makes the npy files irrelevant
+    model = super(BaseSentence2VecModel, cls).load(*args, **kwargs)
+
+    if model.wv_mapfile_path is not None:
+        model._load_all_vectors_from_disk(model.wv_mapfile_path)
+    model.wv_mapfile_shapes = None
+
+    set_madvise_for_mmap()
+
+    return model
+
+
+
+

Methods

+
+
+def estimate_memory(self, max_index: int, report: dict = None, **kwargs) ‑> Dict[str, int] +
+
+

Estimate the size of the sentence embedding

+

Parameters

+
+
max_index : int
+
Maximum index found during the initial scan
+
report : dict
+
Report of subclasses
+
+

Returns

+
+
dict
+
Dictionary of estimated memory sizes
+
+
+ +Expand source code + +
def estimate_memory(
+    self, max_index: int, report: dict = None, **kwargs
+) -> Dict[str, int]:
+    """Estimate the size of the sentence embedding
+
+    Parameters
+    ----------
+    max_index : int
+        Maximum index found during the initial scan
+    report : dict
+        Report of subclasses
+
+    Returns
+    -------
+    dict
+        Dictionary of estimated memory sizes
+
+    """
+    vocab_size = len(self.wv.vectors)
+
+    report = report or {}
+    report["Word Weights"] = vocab_size * dtype(REAL).itemsize
+    report["Word Vectors"] = vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+    report["Sentence Vectors"] = (
+        max_index * self.wv.vector_size * dtype(REAL).itemsize
+    )
+    if self.is_ft:
+        report["Vocab Vectors"] = (
+            vocab_size * self.wv.vector_size * dtype(REAL).itemsize
+        )
+        report["Ngram Vectors"] = (
+            self.wv.vectors_ngrams.shape[0]
+            * self.wv.vector_size
+            * dtype(REAL).itemsize
+        )
+    report["Total"] = sum(report.values())
+    mb_size = int(report["Total"] / 1024 ** 2)
+    logger.info(
+        f"estimated memory for {max_index} sentences with "
+        f"{self.wv.vector_size} dimensions and {vocab_size} vocabulary: "
+        f"{mb_size} MB ({int(mb_size / 1024)} GB)"
+    )
+    if report["Total"] >= 0.95 * virtual_memory()[1]:
+        logger.warning(
+            "The embeddings will likely not fit into RAM. Consider to use mapfile_path"
+        )
+    return report
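For intuition, each matrix costs rows × dimensions × 4 bytes (float32). A hypothetical example for one million sentences with 300 dimensions:

from numpy import dtype, float32
sentence_bytes = 1_000_000 * 300 * dtype(float32).itemsize  # 1.2e9 bytes
print(sentence_bytes // 1024 ** 2)  # ~1144 MB for the sentence matrix alone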
+
+
+
+def infer(self, sentences: List[tuple] = None, use_norm=False) ‑> numpy.ndarray +
+
+

Secondary routine to train an embedding. This method is essential for small batches of sentences, +which require little computation. Note: This method does not apply post-training transformations, +only post inference calls (such as removing principal components).

+

Parameters

+
+
sentences : (list, iterable)
+
An iterable consisting of tuple objects
+
use_norm : bool
+
If True, the sentence vectors will be L2 normalized (unit euclidean length)
+
+

Returns

+
+
ndarray
+
Computed sentence vectors
+
+
+ +Expand source code + +
def infer(self, sentences: List[tuple] = None, use_norm=False) -> ndarray:
+    """Secondary routine to train an embedding. This method is essential for small batches of sentences,
+    which require little computation. Note: This method does not apply post-training transformations,
+    only post inference calls (such as removing principal components).
+
+    Parameters
+    ----------
+    sentences : (list, iterable)
+        An iterable consisting of tuple objects
+    use_norm : bool
+        If True, the sentence vectors will be L2 normalized (unit euclidean length)
+
+    Returns
+    -------
+    ndarray
+        Computed sentence vectors
+
+    """
+    self._check_input_data_sanity(sentences)
+
+    statistics = self.scan_sentences(sentences)
+
+    output = zeros((statistics["max_index"], self.sv.vector_size), dtype=REAL)
+    mem = self._get_thread_working_mem()
+
+    job_batch, batch_size = [], 0
+    for data_idx, data in enumerate(sentences):
+        data_length = len(data[0])
+        if batch_size + data_length <= self.batch_words:
+            job_batch.append(data)
+            batch_size += data_length
+        else:
+            self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+            job_batch, batch_size = [data], data_length
+    if job_batch:
+        self._do_train_job(data_iterable=job_batch, target=output, memory=mem)
+
+    self._post_inference_calls(output=output)
+
+    if use_norm:
+        output = _l2_norm(output)
+    return output
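A usage sketch (the sentences are hypothetical; each item is a (words, index) tuple, and the indices address rows of the returned array):

new = [(["hello", "world"], 0), (["another", "sentence"], 1)]
vecs = model.infer(new)                 # shape: (2, model.sv.vector_size)
unit = model.infer(new, use_norm=True)  # rows scaled to unit euclidean length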
+
+
+
+def save(self, *args, **kwargs) +
+
+

Save the model. +This saved model can be loaded again using :func:~fse.models.base_s2v.BaseSentence2VecModel.load

+

Parameters

+
+
fname : str
+
Path to the file.
+
+
+ +Expand source code + +
def save(self, *args, **kwargs):
+    """Save the model.
+    This saved model can be loaded again using :func:`~fse.models.base_s2v.BaseSentence2VecModel.load`
+
+    Parameters
+    ----------
+    fname : str
+        Path to the file.
+
+    """
+    # Manually removes vectors from the wv class because we cannot modify the save method
+    if self.wv_mapfile_path is not None:
+        self.wv.vectors = None
+    super(BaseSentence2VecModel, self).save(*args, **kwargs)
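A round-trip sketch (the file name is illustrative; in practice, call load on the concrete subclass you trained, e.g. SIF):

from fse.models.sif import SIF
model.save("my_model.fse")
model = SIF.load("my_model.fse")  # memmapped word vectors are re-attached on load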
+
+
+
+def scan_sentences(self, sentences: List[tuple] = None, progress_per: int = 5) ‑> Dict[str, int] +
+
+

Performs an initial scan of the data and reports all corresponding statistics

+

Parameters

+
+
sentences : (list, iterable)
+
An iterable consisting of tuple objects
+
progress_per : int
+
Number of seconds to pass before reporting the scan progress
+
+

Returns

+
+
dict
+
Dictionary containing the scan statistics
+
+
+ +Expand source code + +
def scan_sentences(
+    self, sentences: List[tuple] = None, progress_per: int = 5
+) -> Dict[str, int]:
+    """Performs an initial scan of the data and reports all corresponding statistics
+
+    Parameters
+    ----------
+    sentences : (list, iterable)
+        An iterable consisting of tuple objects
+    progress_per : int
+        Number of seconds to pass before reporting the scan progress
+
+    Returns
+    -------
+    dict
+        Dictionary containing the scan statistics
+
+    """
+    logger.info("scanning all indexed sentences and their word counts")
+
+    current_time = time()
+    total_sentences = 0
+    total_words = 0
+    average_length = 0
+    empty_sentences = 0
+    max_index = 0
+    checked_sentences = 0  # We only check the first item to keep the scan fast
+
+    for i, obj in enumerate(sentences):
+        index, sent = self._check_indexed_sent_valid(
+            iterPos=i, obj=obj, checked=checked_sentences
+        )
+        checked_sentences += 1
+        if time() - current_time > progress_per:
+            current_time = time()
+            logger.info(
+                f"SCANNING : finished {total_sentences} sentences with {total_words} words"
+            )
+
+        max_index = max(max_index, index)
+        total_sentences += 1
+        total_words += len(sent)
+
+        if not len(sent):
+            empty_sentences += 1
+
+    if empty_sentences:
+        logger.warning(f"found {empty_sentences} empty sentences")
+
+    if max_index >= total_sentences:
+        raise RuntimeError(
+            f"Index {max_index} is larger than number of sentences {total_sentences}"
+        )
+
+    average_length = int(total_words / total_sentences)
+
+    logger.info(
+        f"finished scanning {total_sentences} sentences with an average length of {average_length} and {total_words} total words"
+    )
+    statistics = {
+        "total_sentences": total_sentences,
+        "total_words": total_words,
+        "average_length": average_length,
+        "empty_sentences": empty_sentences,
+        "max_index": max_index + 1,
+    }
+    return statistics
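A sketch of the returned statistics, given any constructed model (IndexedList wraps plain token lists into the (words, index) tuples expected here):

from fse import IndexedList
s = IndexedList([["hello", "world"], ["another", "sentence"]])
model.scan_sentences(s)
# {'total_sentences': 2, 'total_words': 4, 'average_length': 2,
#  'empty_sentences': 0, 'max_index': 2}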
+
+
+
+def train(self, sentences: List[tuple] = None, update: bool = False, queue_factor: int = 2, report_delay: int = 5) ‑> Tuple[int, int] +
+
+

Main routine to train an embedding. This method writes all sentence vectors into sv.vectors and is +used for computing embeddings for large chunks of data. This method also handles post-training transformations, +such as computing the SVD of the sentence vectors.

+

Parameters

+
+
sentences : (list, iterable)
+
An iterable consisting of tuple objects
+
update : bool
+
If True, the sentence vector matrix will be grown in size (even with memmap)
+
queue_factor : int
+
Multiplier for size of queue -> size = number of workers * queue_factor.
+
report_delay : int
+
Number of seconds between two consecutive progress report messages in the logger.
+
+

Returns

+
+
int, int
+
Count of effective sentences and words encountered
+
+
+ +Expand source code + +
def train(
+    self,
+    sentences: List[tuple] = None,
+    update: bool = False,
+    queue_factor: int = 2,
+    report_delay: int = 5,
+) -> Tuple[int, int]:
+    """Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is
+    used for computing embeddings for large chunks of data. This method also handles post-training transformations,
+    such as computing the SVD of the sentence vectors.
+
+    Parameters
+    ----------
+    sentences : (list, iterable)
+        An iterable consisting of tuple objects
+    update : bool
+        If True, the sentence vector matrix will be grown in size (even with memmap)
+    queue_factor : int
+        Multiplier for size of queue -> size = number of workers * queue_factor.
+    report_delay : int
+        Number of seconds between two consecutive progress report messages in the logger.
+
+    Returns
+    -------
+    int, int
+        Count of effective sentences and words encountered
+
+    """
+    self._check_input_data_sanity(sentences)
+    statistics = self.scan_sentences(sentences)
+
+    self._check_pre_training_sanity(**statistics)
+
+    self.estimate_memory(**statistics)
+    self.prep.prepare_vectors(
+        sv=self.sv, total_sentences=statistics["max_index"], update=update
+    )
+
+    # Perform pre-train calls (i.e. weight computation)
+    self._pre_train_calls(**statistics)
+    self._check_parameter_sanity()
+    self._check_dtype_santiy()
+    start_time = time()
+
+    logger.info(f"begin training")
+
+    _, eff_sentences, eff_words = self._train_manager(
+        data_iterable=sentences,
+        total_sentences=statistics["total_sentences"],
+        queue_factor=queue_factor,
+        report_delay=report_delay,
+    )
+
+    overall_time = time() - start_time
+
+    self._check_post_training_sanity(
+        eff_sentences=eff_sentences, eff_words=eff_words
+    )
+
+    # Perform post-train calls (i.e. principal component removal)
+    self._post_train_calls()
+
+    self._log_train_end(
+        eff_sentences=eff_sentences, eff_words=eff_words, overall_time=overall_time
+    )
+
+    return eff_sentences, eff_words
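A training sketch on the same kind of indexed input (the corpus variable is hypothetical):

from fse import IndexedList
eff_sentences, eff_words = model.train(IndexedList(corpus))
# returns the counts of effective sentences and words actually used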
+
+
+
+
+
+class BaseSentence2VecPreparer +
+
+

Contains helper functions to prepare the weights for the training of BaseSentence2VecModel

+
+ +Expand source code + +
class BaseSentence2VecPreparer(SaveLoad):
+    """ Contains helper functions to perpare the weights for the training of BaseSentence2VecModel """
+
+    def prepare_vectors(
+        self, sv: SentenceVectors, total_sentences: int, update: bool = False
+    ):
+        """Build tables and model weights based on final vocabulary settings."""
+        if not update:
+            self.reset_vectors(sv, total_sentences)
+        else:
+            self.update_vectors(sv, total_sentences)
+
+    def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
+        """Initialize all sentence vectors to zero and overwrite existing files"""
+        logger.info(f"initializing sentence vectors for {total_sentences} sentences")
+        if sv.mapfile_path:
+            sv.vectors = np_memmap(
+                str(sv.mapfile_path) + ".vectors",
+                dtype=REAL,
+                mode="w+",
+                shape=(total_sentences, sv.vector_size),
+            )
+        else:
+            sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+
+        for i in range(total_sentences):
+            sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+        sv.vectors_norm = None
+
+    def update_vectors(self, sv: SentenceVectors, total_sentences: int):
+        """Given existing sentence vectors, append new ones"""
+        logger.info(f"appending sentence vectors for {total_sentences} sentences")
+        sentences_before = len(sv.vectors)
+        sentences_after = len(sv.vectors) + total_sentences
+
+        if sv.mapfile_path:
+            sv.vectors = np_memmap(
+                str(sv.mapfile_path) + ".vectors",
+                dtype=REAL,
+                mode="r+",
+                shape=(sentences_after, sv.vector_size),
+            )
+            for i in range(sentences_before, sentences_after):
+                sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+        else:
+            newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+            for i in range(total_sentences):
+                newvectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+            sv.vectors = vstack([sv.vectors, newvectors])
+        sv.vectors_norm = None
+
+

Ancestors

  • gensim.utils.SaveLoad

Methods

+
+
+def prepare_vectors(self, sv: SentenceVectors, total_sentences: int, update: bool = False) +
+
+

Build tables and model weights based on final vocabulary settings.

+
+ +Expand source code + +
def prepare_vectors(
+    self, sv: SentenceVectors, total_sentences: int, update: bool = False
+):
+    """Build tables and model weights based on final vocabulary settings."""
+    if not update:
+        self.reset_vectors(sv, total_sentences)
+    else:
+        self.update_vectors(sv, total_sentences)
+
+
+
+def reset_vectors(self, sv: SentenceVectors, total_sentences: int) +
+
+

Initialize all sentence vectors (filled with a small EPS value) and overwrite existing files

+
+ +Expand source code + +
def reset_vectors(self, sv: SentenceVectors, total_sentences: int):
+    """Initialize all sentence vectors to zero and overwrite existing files"""
+    logger.info(f"initializing sentence vectors for {total_sentences} sentences")
+    if sv.mapfile_path:
+        sv.vectors = np_memmap(
+            str(sv.mapfile_path) + ".vectors",
+            dtype=REAL,
+            mode="w+",
+            shape=(total_sentences, sv.vector_size),
+        )
+    else:
+        sv.vectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+
+    for i in range(total_sentences):
+        sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+    sv.vectors_norm = None
+
+
+
+def update_vectors(self, sv: SentenceVectors, total_sentences: int) +
+
+

Given existing sentence vectors, append new ones

+
+ +Expand source code + +
def update_vectors(self, sv: SentenceVectors, total_sentences: int):
+    """Given existing sentence vectors, append new ones"""
+    logger.info(f"appending sentence vectors for {total_sentences} sentences")
+    sentences_before = len(sv.vectors)
+    sentences_after = len(sv.vectors) + total_sentences
+
+    if sv.mapfile_path:
+        sv.vectors = np_memmap(
+            str(sv.mapfile_path) + ".vectors",
+            dtype=REAL,
+            mode="r+",
+            shape=(sentences_after, sv.vector_size),
+        )
+        for i in range(sentences_before, sentences_after):
+            sv.vectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+    else:
+        newvectors = empty((total_sentences, sv.vector_size), dtype=REAL)
+        for i in range(total_sentences):
+            newvectors[i] = full(shape=sv.vector_size, fill_value=EPS, dtype=REAL)
+        sv.vectors = vstack([sv.vectors, newvectors])
+    sv.vectors_norm = None
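A self-contained sketch of the grow-in-place behavior (in-memory path, no mapfile):

from fse.models.base_s2v import BaseSentence2VecPreparer
from fse.models.sentencevectors import SentenceVectors

sv = SentenceVectors(vector_size=4)
prep = BaseSentence2VecPreparer()
prep.prepare_vectors(sv, total_sentences=10)              # allocate 10 rows
prep.prepare_vectors(sv, total_sentences=5, update=True)  # append 5 more
assert sv.vectors.shape == (15, 4)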
+
\ No newline at end of file
diff --git a/docs/fse/models/index.html b/docs/fse/models/index.html
new file mode 100644
index 0000000..770ff92
--- /dev/null
+++ b/docs/fse/models/index.html
@@ -0,0 +1,106 @@
+fse.models API documentation
+
+
+

Module fse.models

+
+
+
+ +Expand source code + +
from .average import Average
+from .sif import SIF
+from .usif import uSIF
+from .sentencevectors import SentenceVectors
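The re-exports above mean the concrete models can be imported directly from the package (a usage sketch):

from fse.models import Average, SIF, uSIF, SentenceVectors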
+
+
+
+

Sub-modules

+
+
fse.models.average
+
+

This module implements the base class to compute average representations for sentences, using highly optimized C routines, +data streaming and Pythonic …

+
+
fse.models.average_inner
+
+

Optimized cython functions for computing sentence embeddings

+
+
fse.models.base_s2v
+
+

Base class containing common methods for training, using & evaluating sentence embeddings. +A lot of the code is based on Gensim. I have to thank Radim …

+
+
fse.models.sentencevectors
+
+
+
+
fse.models.sif
+
+
+
+
fse.models.usif
+
+
+
+
fse.models.utils
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/fse/models/sentencevectors.html b/docs/fse/models/sentencevectors.html
new file mode 100644
index 0000000..a07f0a9
--- /dev/null
+++ b/docs/fse/models/sentencevectors.html
@@ -0,0 +1,1649 @@
+fse.models.sentencevectors API documentation
+
+
+

Module fse.models.sentencevectors

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+
+from __future__ import division
+
+from fse.inputs import IndexedList, IndexedLineDocument
+
+from fse.models.utils import set_madvise_for_mmap
+
+from gensim.models.keyedvectors import KeyedVectors
+
+from numpy import (
+    dot,
+    float32 as REAL,
+    memmap as np_memmap,
+    array,
+    zeros,
+    vstack,
+    sqrt,
+    newaxis,
+    integer,
+    ndarray,
+)
+
+from gensim import utils, matutils
+
+from typing import List, Tuple, Union
+
+from pathlib import Path
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class SentenceVectors(utils.SaveLoad):
+    def __init__(self, vector_size: int, mapfile_path: str = None):
+
+        set_madvise_for_mmap()
+
+        self.vector_size = vector_size  # Size of vectors
+        self.vectors = zeros((0, vector_size), REAL)  # Vectors for sentences
+        self.vectors_norm = None
+
+        # File for numpy memmap
+        self.mapfile_path = Path(mapfile_path) if mapfile_path is not None else None
+        self.mapfile_shape = None
+
+    def __getitem__(self, entities: int) -> ndarray:
+        """Get vector representation of `entities`.
+
+        Parameters
+        ----------
+        entities : {int, list of int}
+            Index or sequence of entities.
+
+        Returns
+        -------
+        numpy.ndarray
+            Vector representation for `entities` (1D if `entities` is int, otherwise - 2D).
+
+        """
+
+        if isinstance(
+            entities,
+            (
+                int,
+                integer,
+            ),
+        ):
+            return self.get_vector(entities)
+
+        return vstack([self.get_vector(e) for e in entities])
+
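+    # Sketch: sv[0] returns a single 1D row; sv[[0, 1]] stacks the requested rows into a 2D array.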
+    def __contains__(self, index: int) -> bool:
+        if isinstance(
+            index,
+            (
+                int,
+                integer,
+            ),
+        ):
+            return index < len(self)
+        else:
+            raise KeyError(f"index {index} is not a valid index")
+
+    def __len__(self) -> int:
+        return len(self.vectors)
+
+    def _load_all_vectors_from_disk(self, mapfile_path: Path):
+        """ Reads all vectors from disk """
+        path = str(mapfile_path.absolute())
+        self.vectors = np_memmap(
+            f"{path}.vectors", dtype=REAL, mode="r+", shape=self.mapfile_shape
+        )
+
+    def save(self, *args, **kwargs):
+        """Save object.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the output file.
+
+        See Also
+        --------
+        :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load`
+            Load object.
+
+        """
+        self.mapfile_shape = self.vectors.shape
+        ignore = ["vectors_norm"]
+        # don't bother storing the cached normalized vectors
+        if self.mapfile_path is not None:
+            ignore.append("vectors")
+        kwargs["ignore"] = kwargs.get("ignore", ignore)
+        super(SentenceVectors, self).save(*args, **kwargs)
+
+    @classmethod
+    def load(cls, fname_or_handle, **kwargs):
+        # TODO: Unittests
+        sv = super(SentenceVectors, cls).load(fname_or_handle, **kwargs)
+        path = sv.mapfile_path
+        if path is not None:
+            sv._load_all_vectors_from_disk(mapfile_path=path)
+        set_madvise_for_mmap()
+        return sv
+
+    def get_vector(self, index: int, use_norm: bool = False) -> ndarray:
+        """Get sentence representations in vector space, as a 1D numpy array.
+
+        Parameters
+        ----------
+        index : int
+            Input index
+        use_norm : bool, optional
+            If True - resulting vector will be L2-normalized (unit euclidean length).
+
+        Returns
+        -------
+        numpy.ndarray
+            Vector representation of index.
+
+        Raises
+        ------
+        KeyError
+            If index out of bounds.
+
+        """
+        if index in self:
+            if use_norm:
+                result = self.vectors_norm[index]
+            else:
+                result = self.vectors[index]
+
+            result.setflags(write=False)
+            return result
+        else:
+            raise KeyError("index {index} not found")
+
+    def init_sims(self, replace: bool = False):
+        """Precompute L2-normalized vectors.
+
+        Parameters
+        ----------
+        replace : bool, optional
+            If True - forget the original vectors and only keep the normalized ones = saves lots of memory!
+        """
+        if getattr(self, "vectors_norm", None) is None or replace:
+            logger.info("precomputing L2-norms of sentence vectors")
+            if not replace and self.mapfile_path is not None:
+                self.vectors_norm = np_memmap(
+                    f"{self.mapfile_path}.vectors_norm",
+                    dtype=REAL,
+                    mode="w+",
+                    shape=self.vectors.shape,
+                )
+            self.vectors_norm = _l2_norm(self.vectors, replace=replace)
+
+    def similarity(self, d1: int, d2: int) -> float:
+        """Compute cosine similarity between two sentences from the training set.
+
+        Parameters
+        ----------
+        d1 : int
+            index of sentence
+        d2 : int
+            index of sentence
+
+        Returns
+        -------
+        float
+            The cosine similarity between the vectors of the two sentences.
+
+        """
+        return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
+
+    def distance(self, d1: int, d2: int) -> float:
+        """Compute cosine similarity between two sentences from the training set.
+
+        Parameters
+        ----------
+        d1 : int
+            index of sentence
+        d2 : int
+            index of sentence
+
+        Returns
+        -------
+        float
+            The cosine distance between the vectors of the two sentences.
+
+        """
+        return 1 - self.similarity(d1, d2)
+
+    def most_similar(
+        self,
+        positive: Union[int, ndarray] = None,
+        negative: Union[int, ndarray] = None,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences.
+        Positive sentences contribute positively towards the similarity, negative sentences negatively.
+
+        This method computes cosine similarity between a simple mean of the projection
+        weight vectors of the given sentences and the vectors for each sentence in the model.
+
+        Parameters
+        ----------
+        positive : list of int, optional
+            List of indices that contribute positively.
+        negative : list of int, optional
+            List of indices that contribute negatively.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        if indexable is not None and not hasattr(indexable, "__getitem__"):
+            raise RuntimeError("Indexable must provide __getitem__")
+        if positive is None:
+            positive = []
+        if negative is None:
+            negative = []
+
+        self.init_sims()
+
+        if isinstance(positive, (int, integer)) and not negative:
+            positive = [positive]
+        if isinstance(positive, (ndarray)) and not negative:
+            if len(positive.shape) == 1:
+                positive = [positive]
+
+        positive = [
+            (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+            for sent in positive
+        ]
+        negative = [
+            (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+            for sent in negative
+        ]
+
+        all_sents, mean = set(), []
+        for sent, weight in positive + negative:
+            if isinstance(sent, ndarray):
+                mean.append(weight * sent)
+            else:
+                mean.append(weight * self.get_vector(index=sent, use_norm=True))
+                if sent in self:
+                    all_sents.add(sent)
+        if not mean:
+            raise ValueError("cannot compute similarity with no input")
+        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+        if isinstance(restrict_size, (int, integer)):
+            lo, hi = 0, restrict_size
+        elif isinstance(restrict_size, tuple):
+            lo, hi = restrict_size
+        else:
+            lo, hi = 0, None
+
+        limited = (
+            self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi]
+        )
+        dists = dot(limited, mean)
+        if not topn:
+            return dists
+        best = matutils.argsort(dists, topn=topn + len(all_sents), reverse=True)
+        best_off = best + lo
+
+        if indexable is not None:
+            result = [
+                (indexable[off_idx], off_idx, float(dists[idx]))
+                for off_idx, idx in zip(best_off, best)
+                if off_idx not in all_sents
+            ]
+        else:
+            result = [
+                (off_idx, float(dists[idx]))
+                for off_idx, idx in zip(best_off, best)
+                if off_idx not in all_sents
+            ]
+        return result[:topn]
+
+    def similar_by_word(
+        self,
+        word: str,
+        wv: KeyedVectors,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given word.
+
+        Parameters
+        ----------
+        word : str
+            Word
+        wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+            This object essentially contains the mapping between words and embeddings.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        return self.most_similar(
+            positive=wv[word],
+            indexable=indexable,
+            topn=topn,
+            restrict_size=restrict_size,
+        )
+
+    def similar_by_sentence(
+        self,
+        sentence: List[str],
+        model,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given sentence.
+
+        Parameters
+        ----------
+        sentence : list of str
+            Sentence as list of strings
+        model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+            This object essentially provides the infer method used to transform the sentence into a vector.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        infer_op = getattr(model, "infer", None)
+        if not callable(infer_op):
+            raise RuntimeError(
+                "Model does not have infer method. Make sure to pass a BaseSentence2VecModel"
+            )
+
+        vector = model.infer([(sentence, 0)])
+        return self.most_similar(
+            positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+        )
+
+    def similar_by_vector(
+        self,
+        vector: ndarray,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given vector.
+
+        Parameters
+        ----------
+        vector : ndarray
+            Vectors
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        return self.most_similar(
+            positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+        )
+
+
+def _l2_norm(m, replace=False):
+    """Return an L2-normalized version of a matrix.
+
+    Parameters
+    ----------
+    m : np.array
+        The matrix to normalize.
+    replace : boolean, optional
+        If True, modifies the existing matrix.
+
+    Returns
+    -------
+    The normalized matrix.  If replace=True, this will be the same as m.
+
+    NOTE: This part is copied from Gensim and modified as the call
+    m /= dist sometimes raises an exception and sometimes does not.
+    """
+    dist = sqrt((m ** 2).sum(-1))[..., newaxis]
+    if replace:
+        m = m / dist
+        return m
+    else:
+        return (m / dist).astype(REAL)
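
For a quick sanity check of `_l2_norm`, here is a minimal sketch (assuming only numpy; `m` is an arbitrary float32 matrix and normalization runs over the last axis):

```
import numpy as np

m = np.array([[3.0, 4.0]], dtype=np.float32)
normed = _l2_norm(m)  # [[0.6, 0.8]]: each row scaled to unit euclidean length
np.testing.assert_allclose(np.linalg.norm(normed, axis=-1), 1.0, rtol=1e-6)
```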
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class SentenceVectors +(vector_size: int, mapfile_path: str = None) +
+
+

Serialize/deserialize objects from disk, by equipping them with the save() / load() methods.

+

Warnings

+

This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes +such as lambda functions etc.

+
+ +Expand source code + +
class SentenceVectors(utils.SaveLoad):
+    def __init__(self, vector_size: int, mapfile_path: str = None):
+
+        set_madvise_for_mmap()
+
+        self.vector_size = vector_size  # Size of vectors
+        self.vectors = zeros((0, vector_size), REAL)  # Vectors for sentences
+        self.vectors_norm = None
+
+        # File for numpy memmap
+        self.mapfile_path = Path(mapfile_path) if mapfile_path is not None else None
+        self.mapfile_shape = None
+
+    def __getitem__(self, entities: int) -> ndarray:
+        """Get vector representation of `entities`.
+
+        Parameters
+        ----------
+        entities : {int, list of int}
+            Index or sequence of entities.
+
+        Returns
+        -------
+        numpy.ndarray
+            Vector representation for `entities` (1D if `entities` is int, otherwise - 2D).
+
+        """
+
+        if isinstance(
+            entities,
+            (
+                int,
+                integer,
+            ),
+        ):
+            return self.get_vector(entities)
+
+        return vstack([self.get_vector(e) for e in entities])
+
+    def __contains__(self, index: int) -> bool:
+        if isinstance(
+            index,
+            (
+                int,
+                integer,
+            ),
+        ):
+            return index < len(self)
+        else:
+            raise KeyError(f"index {index} is not a valid index")
+
+    def __len__(self) -> int:
+        return len(self.vectors)
+
+    def _load_all_vectors_from_disk(self, mapfile_path: Path):
+        """ Reads all vectors from disk """
+        path = str(mapfile_path.absolute())
+        self.vectors = np_memmap(
+            f"{path}.vectors", dtype=REAL, mode="r+", shape=self.mapfile_shape
+        )
+
+    def save(self, *args, **kwargs):
+        """Save object.
+
+        Parameters
+        ----------
+        fname : str
+            Path to the output file.
+
+        See Also
+        --------
+        :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load`
+            Load object.
+
+        """
+        self.mapfile_shape = self.vectors.shape
+        ignore = ["vectors_norm"]
+        # don't bother storing the cached normalized vectors
+        if self.mapfile_path is not None:
+            ignore.append("vectors")
+        kwargs["ignore"] = kwargs.get("ignore", ignore)
+        super(SentenceVectors, self).save(*args, **kwargs)
+
+    @classmethod
+    def load(cls, fname_or_handle, **kwargs):
+        # TODO: Unittests
+        sv = super(SentenceVectors, cls).load(fname_or_handle, **kwargs)
+        path = sv.mapfile_path
+        if path is not None:
+            sv._load_all_vectors_from_disk(mapfile_path=path)
+        set_madvise_for_mmap()
+        return sv
+
+    def get_vector(self, index: int, use_norm: bool = False) -> ndarray:
+        """Get sentence representations in vector space, as a 1D numpy array.
+
+        Parameters
+        ----------
+        index : int
+            Input index
+        use_norm : bool, optional
+            If True - resulting vector will be L2-normalized (unit euclidean length).
+
+        Returns
+        -------
+        numpy.ndarray
+            Vector representation of index.
+
+        Raises
+        ------
+        KeyError
+            If index out of bounds.
+
+        """
+        if index in self:
+            if use_norm:
+                result = self.vectors_norm[index]
+            else:
+                result = self.vectors[index]
+
+            result.setflags(write=False)
+            return result
+        else:
+            raise KeyError("index {index} not found")
+
+    def init_sims(self, replace: bool = False):
+        """Precompute L2-normalized vectors.
+
+        Parameters
+        ----------
+        replace : bool, optional
+            If True - forget the original vectors and only keep the normalized ones = saves lots of memory!
+        """
+        if getattr(self, "vectors_norm", None) is None or replace:
+            logger.info("precomputing L2-norms of sentence vectors")
+            if not replace and self.mapfile_path is not None:
+                self.vectors_norm = np_memmap(
+                    f"{self.mapfile_path}.vectors_norm",
+                    dtype=REAL,
+                    mode="w+",
+                    shape=self.vectors.shape,
+                )
+            self.vectors_norm = _l2_norm(self.vectors, replace=replace)
+
+    def similarity(self, d1: int, d2: int) -> float:
+        """Compute cosine similarity between two sentences from the training set.
+
+        Parameters
+        ----------
+        d1 : int
+            index of sentence
+        d2 : int
+            index of sentence
+
+        Returns
+        -------
+        float
+            The cosine similarity between the vectors of the two sentences.
+
+        """
+        return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
+
+    def distance(self, d1: int, d2: int) -> float:
+        """Compute cosine similarity between two sentences from the training set.
+
+        Parameters
+        ----------
+        d1 : int
+            index of sentence
+        d2 : int
+            index of sentence
+
+        Returns
+        -------
+        float
+            The cosine distance between the vectors of the two sentences.
+
+        """
+        return 1 - self.similarity(d1, d2)
+
+    def most_similar(
+        self,
+        positive: Union[int, ndarray] = None,
+        negative: Union[int, ndarray] = None,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences.
+        Positive sentences contribute positively towards the similarity, negative sentences negatively.
+
+        This method computes cosine similarity between a simple mean of the projection
+        weight vectors of the given sentences and the vectors for each sentence in the model.
+
+        Parameters
+        ----------
+        positive : list of int, optional
+            List of indices that contribute positively.
+        negative : list of int, optional
+            List of indices that contribute negatively.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        if indexable is not None and not hasattr(indexable, "__getitem__"):
+            raise RuntimeError("Indexable must provide __getitem__")
+        if positive is None:
+            positive = []
+        if negative is None:
+            negative = []
+
+        self.init_sims()
+
+        if isinstance(positive, (int, integer)) and not negative:
+            positive = [positive]
+        if isinstance(positive, (ndarray)) and not negative:
+            if len(positive.shape) == 1:
+                positive = [positive]
+
+        positive = [
+            (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+            for sent in positive
+        ]
+        negative = [
+            (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+            for sent in negative
+        ]
+
+        all_sents, mean = set(), []
+        for sent, weight in positive + negative:
+            if isinstance(sent, ndarray):
+                mean.append(weight * sent)
+            else:
+                mean.append(weight * self.get_vector(index=sent, use_norm=True))
+                if sent in self:
+                    all_sents.add(sent)
+        if not mean:
+            raise ValueError("cannot compute similarity with no input")
+        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+        if isinstance(restrict_size, (int, integer)):
+            lo, hi = 0, restrict_size
+        elif isinstance(restrict_size, tuple):
+            lo, hi = restrict_size
+        else:
+            lo, hi = 0, None
+
+        limited = (
+            self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi]
+        )
+        dists = dot(limited, mean)
+        if not topn:
+            return dists
+        best = matutils.argsort(dists, topn=topn + len(all_sents), reverse=True)
+        best_off = best + lo
+
+        if indexable is not None:
+            result = [
+                (indexable[off_idx], off_idx, float(dists[idx]))
+                for off_idx, idx in zip(best_off, best)
+                if off_idx not in all_sents
+            ]
+        else:
+            result = [
+                (off_idx, float(dists[idx]))
+                for off_idx, idx in zip(best_off, best)
+                if off_idx not in all_sents
+            ]
+        return result[:topn]
+
+    def similar_by_word(
+        self,
+        word: str,
+        wv: KeyedVectors,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given word.
+
+        Parameters
+        ----------
+        word : str
+            Word
+        wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+            This object essentially contains the mapping between words and embeddings.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        return self.most_similar(
+            positive=wv[word],
+            indexable=indexable,
+            topn=topn,
+            restrict_size=restrict_size,
+        )
+
+    def similar_by_sentence(
+        self,
+        sentence: List[str],
+        model,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given sentence.
+
+        Parameters
+        ----------
+        sentence : list of str
+            Sentence as list of strings
+        model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+            This object essentially provides the infer method used to transform the sentence into a vector.
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        infer_op = getattr(model, "infer", None)
+        if not callable(infer_op):
+            raise RuntimeError(
+                "Model does not have infer method. Make sure to pass a BaseSentence2VecModel"
+            )
+
+        vector = model.infer([(sentence, 0)])
+        return self.most_similar(
+            positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+        )
+
+    def similar_by_vector(
+        self,
+        vector: ndarray,
+        indexable: Union[IndexedList, IndexedLineDocument] = None,
+        topn: int = 10,
+        restrict_size: Union[int, Tuple[int, int]] = None,
+    ) -> List[Tuple[int, float]]:
+
+        """Find the top-N most similar sentences to a given vector.
+
+        Parameters
+        ----------
+        vector : ndarray
+            Vectors
+        indexable: list, IndexedList, IndexedLineDocument
+            Provides an indexable object from where the most similar sentences are read
+        topn : int or None, optional
+            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+            then similarities for all sentences are returned.
+        restrict_size : int or Tuple(int,int), optional
+            Optional integer which limits the range of vectors which
+            are searched for most-similar values. For example, restrict_size=10000 would
+            only check the first 10000 sentence vectors.
+            restrict_size=(500, 1000) would search the sentence vectors with indices between
+            500 and 1000.
+
+        Returns
+        -------
+        list of (int, float) or list of (str, int, float)
+            A sequence of (index, similarity) tuples is returned.
+            When an indexable is provided, returns (str, index, similarity) instead.
+            When `topn` is None, then similarities for all sentences are returned as a
+            one-dimensional numpy array with one entry per sentence vector.
+
+        """
+        return self.most_similar(
+            positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+        )
+
+

Ancestors

+
  • gensim.utils.SaveLoad
+

Static methods

+
+
+def load(fname_or_handle, **kwargs) +
+
+

Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.

+

Parameters

+
+
fname : str
+
Path to file that contains needed object.
+
mmap : str, optional
+
Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays via mmap (shared memory) using mmap='r'. If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` must be set.
+
+

See Also

+

:meth:`~gensim.utils.SaveLoad.save`: Save object to file.

+

Returns

+
+
object
+
Object loaded from fname.
+
+

Raises

+
+
AttributeError
+
When called on an object instance instead of class (this is a class method).
+
+
+ +Expand source code + +
@classmethod
+def load(cls, fname_or_handle, **kwargs):
+    # TODO: Unittests
+    sv = super(SentenceVectors, cls).load(fname_or_handle, **kwargs)
+    path = sv.mapfile_path
+    if path is not None:
+        sv._load_all_vectors_from_disk(mapfile_path=path)
+    set_madvise_for_mmap()
+    return sv
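
For illustration, a minimal save/load round trip, assuming a trained fse model `model` whose sentence vectors live in `model.sv` (the file name is arbitrary):

```
from fse.models.sentencevectors import SentenceVectors

model.sv.save("sentence_vectors.model")
sv = SentenceVectors.load("sentence_vectors.model")
assert len(sv) == len(model.sv)
```

If `mapfile_path` was set, `save` skips pickling the raw vectors and `load` restores them from the `.vectors` memmap on disk instead.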
+
+
+
+

Methods

+
+
+def distance(self, d1: int, d2: int) ‑> float +
+
+

Compute cosine distance between two sentences from the training set.

+

Parameters

+
+
d1 : int
+
index of sentence
+
d2 : int
+
index of sentence
+
+

Returns

+
+
float
+
The cosine distance between the vectors of the two sentences.
+
+
+ +Expand source code + +
def distance(self, d1: int, d2: int) -> float:
+    """Compute cosine similarity between two sentences from the training set.
+
+    Parameters
+    ----------
+    d1 : int
+        index of sentence
+    d2 : int
+        index of sentence
+
+    Returns
+    -------
+    float
+        The cosine distance between the vectors of the two sentences.
+
+    """
+    return 1 - self.similarity(d1, d2)
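
The relationship between the two measures is easy to verify on any trained model (the indices 0 and 1 are hypothetical):

```
sim = model.sv.similarity(0, 1)  # cosine similarity in [-1, 1]
dist = model.sv.distance(0, 1)   # cosine distance, defined as 1 - similarity
assert abs(sim + dist - 1.0) < 1e-6
```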
+
+
+
+def get_vector(self, index: int, use_norm: bool = False) ‑> numpy.ndarray +
+
+

Get sentence representations in vector space, as a 1D numpy array.

+

Parameters

+
+
index : int
+
Input index
+
use_norm : bool, optional
+
If True - resulting vector will be L2-normalized (unit euclidean length).
+
+

Returns

+
+
numpy.ndarray
+
Vector representation of index.
+
+

Raises

+
+
KeyError
+
If index out of bounds.
+
+
+ +Expand source code + +
def get_vector(self, index: int, use_norm: bool = False) -> ndarray:
+    """Get sentence representations in vector space, as a 1D numpy array.
+
+    Parameters
+    ----------
+    index : int
+        Input index
+    use_norm : bool, optional
+        If True - resulting vector will be L2-normalized (unit euclidean length).
+
+    Returns
+    -------
+    numpy.ndarray
+        Vector representation of index.
+
+    Raises
+    ------
+    KeyError
+        If index out of bounds.
+
+    """
+    if index in self:
+        if use_norm:
+            result = self.vectors_norm[index]
+        else:
+            result = self.vectors[index]
+
+        result.setflags(write=False)
+        return result
+    else:
+        raise KeyError("index {index} not found")
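
A usage sketch (hypothetical trained model; `use_norm=True` reads from `vectors_norm`, so `init_sims()` must have run first):

```
vec = model.sv.get_vector(0)                  # raw sentence vector
model.sv.init_sims()                          # populate vectors_norm
unit = model.sv.get_vector(0, use_norm=True)  # unit-length variant
# Returned arrays are marked read-only via setflags(write=False),
# so attempting to mutate them in place raises a ValueError.
```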
+
+
+
+def init_sims(self, replace: bool = False) +
+
+

Precompute L2-normalized vectors.

+

Parameters

+
+
replace : bool, optional
+
If True - forget the original vectors and only keep the normalized ones = saves lots of memory!
+
+
+ +Expand source code + +
def init_sims(self, replace: bool = False):
+    """Precompute L2-normalized vectors.
+
+    Parameters
+    ----------
+    replace : bool, optional
+        If True - forget the original vectors and only keep the normalized ones = saves lots of memory!
+    """
+    if getattr(self, "vectors_norm", None) is None or replace:
+        logger.info("precomputing L2-norms of sentence vectors")
+        if not replace and self.mapfile_path is not None:
+            self.vectors_norm = np_memmap(
+                f"{self.mapfile_path}.vectors_norm",
+                dtype=REAL,
+                mode="w+",
+                shape=self.vectors.shape,
+            )
+        self.vectors_norm = _l2_norm(self.vectors, replace=replace)
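
A small behavioural sketch (hypothetical trained model): the guard at the top of the method makes repeated calls cheap, and `replace=True` forces recomputation.

```
model.sv.init_sims()              # computes vectors_norm once
model.sv.init_sims()              # no-op, vectors_norm is already set
model.sv.init_sims(replace=True)  # forces the norms to be recomputed
```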
+
+
+
+def most_similar(self, positive: Union[int, numpy.ndarray] = None, negative: Union[int, numpy.ndarray] = None, indexable: Union[IndexedList, IndexedLineDocument] = None, topn: int = 10, restrict_size: Union[int, Tuple[int, int]] = None) ‑> List[Tuple[int, float]] +
+
+

Find the top-N most similar sentences. +Positive sentences contribute positively towards the similarity, negative sentences negatively.

+

This method computes cosine similarity between a simple mean of the projection +weight vectors of the given sentences and the vectors for each sentence in the model.

+

Parameters

+
+
positive : list of int, optional
+
List of indices that contribute positively.
+
negative : list of int, optional
+
List of indices that contribute negatively.
+
indexable : list, IndexedList, IndexedLineDocument
+
Provides an indexable object from where the most similar sentences are read
+
topn : int or None, optional
+
Number of top-N similar sentences to return, when topn is int. When topn is None, +then similarities for all sentences are returned.
+
restrict_size : int or Tuple(int,int), optional
+
Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_size=10000 would only check the first 10000 sentence vectors. restrict_size=(500, 1000) would search the sentence vectors with indices between 500 and 1000.
+
+

Returns

+
+
list of (int, float) or list of (str, int, float)
+
A sequence of (index, similarity) tuples is returned. When an indexable is provided, returns (str, index, similarity) instead. When topn is None, then similarities for all sentences are returned as a one-dimensional numpy array with one entry per sentence vector.
+
+
+ +Expand source code + +
def most_similar(
+    self,
+    positive: Union[int, ndarray] = None,
+    negative: Union[int, ndarray] = None,
+    indexable: Union[IndexedList, IndexedLineDocument] = None,
+    topn: int = 10,
+    restrict_size: Union[int, Tuple[int, int]] = None,
+) -> List[Tuple[int, float]]:
+
+    """Find the top-N most similar sentences.
+    Positive sentences contribute positively towards the similarity, negative sentences negatively.
+
+    This method computes cosine similarity between a simple mean of the projection
+    weight vectors of the given sentences and the vectors for each sentence in the model.
+
+    Parameters
+    ----------
+    positive : list of int, optional
+        List of indices that contribute positively.
+    negative : list of int, optional
+        List of indices that contribute negatively.
+    indexable: list, IndexedList, IndexedLineDocument
+        Provides an indexable object from where the most similar sentences are read
+    topn : int or None, optional
+        Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+        then similarities for all sentences are returned.
+    restrict_size : int or Tuple(int,int), optional
+        Optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_size=10000 would
+        only check the first 10000 sentence vectors.
+        restrict_size=(500, 1000) would search the sentence vectors with indices between
+        500 and 1000.
+
+    Returns
+    -------
+    list of (int, float) or list of (str, int, float)
+        A sequence of (index, similarity) tuples is returned.
+        When an indexable is provided, returns (str, index, similarity) instead.
+        When `topn` is None, then similarities for all sentences are returned as a
+        one-dimensional numpy array with one entry per sentence vector.
+
+    """
+    if indexable is not None and not hasattr(indexable, "__getitem__"):
+        raise RuntimeError("Indexable must provide __getitem__")
+    if positive is None:
+        positive = []
+    if negative is None:
+        negative = []
+
+    self.init_sims()
+
+    if isinstance(positive, (int, integer)) and not negative:
+        positive = [positive]
+    if isinstance(positive, (ndarray)) and not negative:
+        if len(positive.shape) == 1:
+            positive = [positive]
+
+    positive = [
+        (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+        for sent in positive
+    ]
+    negative = [
+        (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent
+        for sent in negative
+    ]
+
+    all_sents, mean = set(), []
+    for sent, weight in positive + negative:
+        if isinstance(sent, ndarray):
+            mean.append(weight * sent)
+        else:
+            mean.append(weight * self.get_vector(index=sent, use_norm=True))
+            if sent in self:
+                all_sents.add(sent)
+    if not mean:
+        raise ValueError("cannot compute similarity with no input")
+    mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+    if isinstance(restrict_size, (int, integer)):
+        lo, hi = 0, restrict_size
+    elif isinstance(restrict_size, tuple):
+        lo, hi = restrict_size
+    else:
+        lo, hi = 0, None
+
+    limited = (
+        self.vectors_norm if restrict_size is None else self.vectors_norm[lo:hi]
+    )
+    dists = dot(limited, mean)
+    if not topn:
+        return dists
+    best = matutils.argsort(dists, topn=topn + len(all_sents), reverse=True)
+    best_off = best + lo
+
+    if indexable is not None:
+        result = [
+            (indexable[off_idx], off_idx, float(dists[idx]))
+            for off_idx, idx in zip(best_off, best)
+            if off_idx not in all_sents
+        ]
+    else:
+        result = [
+            (off_idx, float(dists[idx]))
+            for off_idx, idx in zip(best_off, best)
+            if off_idx not in all_sents
+        ]
+    return result[:topn]
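
A usage sketch, assuming the model was trained on a corpus `sentences` wrapped in an `IndexedList` (all names are illustrative):

```
from fse import IndexedList

s = IndexedList(sentences)
# Ten nearest neighbours of sentence 0, searched only among the
# first 1000 sentence vectors:
model.sv.most_similar(positive=0, indexable=s, topn=10, restrict_size=1000)
```

Note that query indices are excluded from the result, which is why `topn + len(all_sents)` candidates are ranked internally before truncation.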
+
+
+
+def save(self, *args, **kwargs) +
+
+

Save object.

+

Parameters

+
+
fname : str
+
Path to the output file.
+
+

See Also

+

:meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load`: Load object.

+
+ +Expand source code + +
def save(self, *args, **kwargs):
+    """Save object.
+
+    Parameters
+    ----------
+    fname : str
+        Path to the output file.
+
+    See Also
+    --------
+    :meth:`~gensim.models.keyedvectors.Doc2VecKeyedVectors.load`
+        Load object.
+
+    """
+    self.mapfile_shape = self.vectors.shape
+    ignore = ["vectors_norm"]
+    # don't bother storing the cached normalized vectors
+    if self.mapfile_path is not None:
+        ignore.append("vectors")
+    kwargs["ignore"] = kwargs.get("ignore", ignore)
+    super(SentenceVectors, self).save(*args, **kwargs)
+
+
+
+def similar_by_sentence(self, sentence: List[str], model, indexable: Union[IndexedList, IndexedLineDocument] = None, topn: int = 10, restrict_size: Union[int, Tuple[int, int]] = None) ‑> List[Tuple[int, float]] +
+
+

Find the top-N most similar sentences to a given sentence.

+

Parameters

+
+
sentence : list of str
+
Sentence as list of strings
+
model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+
This object essentially provides the infer method used to transform the sentence into a vector.
+
indexable : list, IndexedList, IndexedLineDocument
+
Provides an indexable object from where the most similar sentences are read
+
topn : int or None, optional
+
Number of top-N similar sentences to return, when topn is int. When topn is None, +then similarities for all sentences are returned.
+
restrict_size : int or Tuple(int,int), optional
+
Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_size=10000 would only check the first 10000 sentence vectors. restrict_size=(500, 1000) would search the sentence vectors with indices between 500 and 1000.
+
+

Returns

+
+
list of (int, float) or list of (str, int, float)
+
A sequence of (index, similarity) tuples is returned. When an indexable is provided, returns (str, index, similarity) instead. When topn is None, then similarities for all sentences are returned as a one-dimensional numpy array with one entry per sentence vector.
+
+
+ +Expand source code + +
def similar_by_sentence(
+    self,
+    sentence: List[str],
+    model,
+    indexable: Union[IndexedList, IndexedLineDocument] = None,
+    topn: int = 10,
+    restrict_size: Union[int, Tuple[int, int]] = None,
+) -> List[Tuple[int, float]]:
+
+    """Find the top-N most similar sentences to a given sentence.
+
+    Parameters
+    ----------
+    sentence : list of str
+        Sentence as list of strings
+    model : :class:`~fse.models.base_s2v.BaseSentence2VecModel`
+        This object essentially provides the infer method used to transform the sentence into a vector.
+    indexable: list, IndexedList, IndexedLineDocument
+        Provides an indexable object from where the most similar sentences are read
+    topn : int or None, optional
+        Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+        then similarities for all sentences are returned.
+    restrict_size : int or Tuple(int,int), optional
+        Optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_size=10000 would
+        only check the first 10000 sentence vectors.
+        restrict_size=(500, 1000) would search the sentence vectors with indices between
+        500 and 1000.
+
+    Returns
+    -------
+    list of (int, float) or list of (str, int, float)
+        A sequence of (index, similarity) tuples is returned.
+        When an indexable is provided, returns (str, index, similarity) instead.
+        When `topn` is None, then similarities for all sentences are returned as a
+        one-dimensional numpy array with one entry per sentence vector.
+
+    """
+    infer_op = getattr(model, "infer", None)
+    if not callable(infer_op):
+        raise RuntimeError(
+            "Model does not have infer method. Make sure to pass a BaseSentence2VecModel"
+        )
+
+    vector = model.infer([(sentence, 0)])
+    return self.most_similar(
+        positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+    )
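
For illustration, a hedged sketch; `model` must be a trained `BaseSentence2VecModel` subclass (for example `Average` or `SIF`), since only those expose `infer`, and `s` is an indexable corpus as above:

```
model.sv.similar_by_sentence(
    sentence=["the", "cat", "sat"], model=model, indexable=s, topn=5
)
```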
+
+
+
+def similar_by_vector(self, vector: numpy.ndarray, indexable: Union[IndexedList, IndexedLineDocument] = None, topn: int = 10, restrict_size: Union[int, Tuple[int, int]] = None) ‑> List[Tuple[int, float]] +
+
+

Find the top-N most similar sentences to a given vector.

+

Parameters

+
+
vector : ndarray
+
Vectors
+
indexable : list, IndexedList, IndexedLineDocument
+
Provides an indexable object from where the most similar sentences are read
+
topn : int or None, optional
+
Number of top-N similar sentences to return, when topn is int. When topn is None, +then similarities for all sentences are returned.
+
restrict_size : int or Tuple(int,int), optional
+
Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_size=10000 would only check the first 10000 sentence vectors. restrict_size=(500, 1000) would search the sentence vectors with indices between 500 and 1000.
+
+

Returns

+
+
list of (int, float) or list of (str, int, float)
+
A sequence of (index, similarity) tuples is returned. When an indexable is provided, returns (str, index, similarity) instead. When topn is None, then similarities for all sentences are returned as a one-dimensional numpy array with one entry per sentence vector.
+
+
+ +Expand source code + +
def similar_by_vector(
+    self,
+    vector: ndarray,
+    indexable: Union[IndexedList, IndexedLineDocument] = None,
+    topn: int = 10,
+    restrict_size: Union[int, Tuple[int, int]] = None,
+) -> List[Tuple[int, float]]:
+
+    """Find the top-N most similar sentences to a given vector.
+
+    Parameters
+    ----------
+    vector : ndarray
+        Vectors
+    indexable: list, IndexedList, IndexedLineDocument
+        Provides an indexable object from where the most similar sentences are read
+    topn : int or None, optional
+        Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+        then similarities for all sentences are returned.
+    restrict_size : int or Tuple(int,int), optional
+        Optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_size=10000 would
+        only check the first 10000 sentence vectors.
+        restrict_size=(500, 1000) would search the sentence vectors with indices between
+        500 and 1000.
+
+    Returns
+    -------
+    list of (int, float) or list of (str, int, float)
+        A sequence of (index, similarity) tuples is returned.
+        When an indexable is provided, returns (str, index, similarity) instead.
+        When `topn` is None, then similarities for all sentences are returned as a
+        one-dimensional numpy array with one entry per sentence vector.
+
+    """
+    return self.most_similar(
+        positive=vector, indexable=indexable, topn=topn, restrict_size=restrict_size
+    )
+
+
+
+def similar_by_word(self, word: str, wv: gensim.models.keyedvectors.KeyedVectors, indexable: Union[IndexedList, IndexedLineDocument] = None, topn: int = 10, restrict_size: Union[int, Tuple[int, int]] = None) ‑> List[Tuple[int, float]] +
+
+

Find the top-N most similar sentences to a given word.

+

Parameters

+
+
word : str
+
Word
+
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+
This object essentially contains the mapping between words and embeddings.
+
indexable : list, IndexedList, IndexedLineDocument
+
Provides an indexable object from where the most similar sentences are read
+
topn : int or None, optional
+
Number of top-N similar sentences to return, when topn is int. When topn is None, +then similarities for all sentences are returned.
+
restrict_size : int or Tuple(int,int), optional
+
Optional integer which limits the range of vectors which are searched for most-similar values. For example, restrict_size=10000 would only check the first 10000 sentence vectors. restrict_size=(500, 1000) would search the sentence vectors with indices between 500 and 1000.
+
+

Returns

+
+
list of (int, float) or list of (str, int, float)
+
A sequence of (index, similarity) tuples is returned. When an indexable is provided, returns (str, index, similarity) instead. When topn is None, then similarities for all sentences are returned as a one-dimensional numpy array with one entry per sentence vector.
+
+
+ +Expand source code + +
def similar_by_word(
+    self,
+    word: str,
+    wv: KeyedVectors,
+    indexable: Union[IndexedList, IndexedLineDocument] = None,
+    topn: int = 10,
+    restrict_size: Union[int, Tuple[int, int]] = None,
+) -> List[Tuple[int, float]]:
+
+    """Find the top-N most similar sentences to a given word.
+
+    Parameters
+    ----------
+    word : str
+        Word
+    wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+        This object essentially contains the mapping between words and embeddings.
+    indexable: list, IndexedList, IndexedLineDocument
+        Provides an indexable object from where the most similar sentences are read
+    topn : int or None, optional
+        Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
+        then similarities for all sentences are returned.
+    restrict_size : int or Tuple(int,int), optional
+        Optional integer which limits the range of vectors which
+        are searched for most-similar values. For example, restrict_size=10000 would
+        only check the first 10000 sentence vectors.
+        restrict_size=(500, 1000) would search the sentence vectors with indices between
+        500 and 1000.
+
+    Returns
+    -------
+    list of (int, float) or list of (str, int, float)
+        A sequence of (index, similarity) tuples is returned.
+        When an indexable is provided, returns (str, index, similarity) instead.
+        When `topn` is None, then similarities for all sentences are returned as a
+        one-dimensional numpy array with one entry per sentence vector.
+
+    """
+    return self.most_similar(
+        positive=wv[word],
+        indexable=indexable,
+        topn=topn,
+        restrict_size=restrict_size,
+    )
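
A sketch under the same assumptions, where the query word must be present in the word vectors the model was trained with:

```
model.sv.similar_by_word(word="cat", wv=model.wv, indexable=s, topn=5)
```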
+
+
+
+def similarity(self, d1: int, d2: int) ‑> float +
+
+

Compute cosine similarity between two sentences from the training set.

+

Parameters

+
+
d1 : int
+
index of sentence
+
d2 : int
+
index of sentence
+
+

Returns

+
+
float
+
The cosine similarity between the vectors of the two sentences.
+
+
+ +Expand source code + +
def similarity(self, d1: int, d2: int) -> float:
+    """Compute cosine similarity between two sentences from the training set.
+
+    Parameters
+    ----------
+    d1 : int
+        index of sentence
+    d2 : int
+        index of sentence
+
+    Returns
+    -------
+    float
+        The cosine similarity between the vectors of the two sentences.
+
+    """
+    return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2]))
+
+
+
+
+
+
+
+ +
\ No newline at end of file
diff --git a/docs/fse/models/sif.html b/docs/fse/models/sif.html
new file mode 100644
index 0000000..6ed5104
--- /dev/null
+++ b/docs/fse/models/sif.html
@@ -0,0 +1,438 @@
+fse.models.sif API documentation
+
+
+

Module fse.models.sif

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+from fse.models.average import Average
+from fse.models.utils import compute_principal_components, remove_principal_components
+
+from gensim.models.keyedvectors import KeyedVectors
+
+from numpy import ndarray, float32 as REAL, zeros, isfinite
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class SIF(Average):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        alpha: float = 1e-3,
+        components: int = 1,
+        cache_size_gb: float = 1.0,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+    ):
+        """Smooth-inverse frequency (SIF) weighted sentence embeddings model. Performs a weighted averaging operation over all
+        words in a sentence. After training, the model removes a number of singular vectors.
+
+        The implementation is based on Arora et al. (2017): A Simple but Tough-to-Beat Baseline for Sentence Embeddings.
+        For more information, see <https://openreview.net/pdf?id=SyK00v5xx> and <https://github.com/PrincetonML/SIF>
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.key_to_index and wv.vectors elements are required.
+        alpha : float, optional
+            Alpha is the weighting factor used to downweigh each individual word.
+        components : int, optional
+            Corresponds to the number of singular vectors to remove from the sentence embeddings.
+        cache_size_gb : float, optional
+            Cache size for computing the singular vectors in GB.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the per-word counts based on :class:`~wordfreq`.
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+
+        """
+
+        self.alpha = float(alpha)
+        self.components = int(components)
+        self.cache_size_gb = float(cache_size_gb)
+        self.svd_res = None
+
+        if lang_freq is None:
+            logger.info(
+                "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument."
+            )
+
+        super(SIF, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+        )
+
+    def _check_parameter_sanity(self):
+        """ Check the sanity of all paramters """
+        if not all(self.word_weights <= 1.0) or not all(self.word_weights >= 0.0):
+            raise ValueError("For SIF, all word weights must be 0 <= w_weight <= 1")
+        if self.alpha <= 0.0:
+            raise ValueError("Alpha must be greater than zero.")
+        if self.components < 0.0:
+            raise ValueError("Components must be greater or equal zero")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training """
+        self._compute_sif_weights()
+
+    def _post_train_calls(self):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        if self.components > 0:
+            self.svd_res = compute_principal_components(
+                self.sv.vectors,
+                components=self.components,
+                cache_size_gb=self.cache_size_gb,
+            )
+            remove_principal_components(
+                self.sv.vectors, svd_res=self.svd_res, inplace=True
+            )
+        else:
+            self.svd_res = 0
+            logger.info(f"no removal of principal components")
+
+    def _post_inference_calls(self, output: ndarray, **kwargs):
+        """ Function calls to perform after training & inference """
+        if self.svd_res is None:
+            raise RuntimeError(
+                "You must first train the model to obtain SVD components"
+            )
+        elif self.components > 0:
+            remove_principal_components(output, svd_res=self.svd_res, inplace=True)
+        else:
+            logger.info(f"no removal of principal components")
+
+    def _check_dtype_santiy(self):
+        """ Check the dtypes of all attributes """
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+        if self.svd_res is not None:
+            if self.svd_res[0].dtype != REAL:
+                raise TypeError(f"type of svd values is wrong: {self.svd_res[0].dtype}")
+            if self.svd_res[1].dtype != REAL:
+                raise TypeError(
+                    f"type of svd components is wrong: {self.svd_res[1].dtype}"
+                )
+
+    def _compute_sif_weights(self):
+        """ Precomputes the SIF weights for all words in the vocabulary """
+        logger.info(f"pre-computing SIF weights for {len(self.wv)} words")
+        v = len(self.wv)
+        corpus_size = 0
+
+        pw = zeros(v, dtype=REAL)
+        for word in self.wv.key_to_index:
+            c = self.wv.get_vecattr(word, "count")
+            if c < 0:
+                raise ValueError("vocab count is negative")
+            corpus_size += c
+            pw[self.wv.key_to_index[word]] = c
+        pw /= corpus_size
+
+        self.word_weights = (self.alpha / (self.alpha + pw)).astype(REAL)
+
+        if not all(isfinite(self.word_weights)) or any(self.word_weights < 0):
+            raise RuntimeError(
+                "Encountered nan values. "
+                "This likely happens because the word frequency information is wrong/missing. "
+                "Consider restarting using lang_freq argument to infer frequency. "
+            )
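
The weight formula is `alpha / (alpha + p(w))`, where `p(w)` is a word's relative corpus frequency. A quick worked example with the default `alpha=1e-3`:

```
alpha = 1e-3
for p in (1e-2, 1e-5):          # a frequent word vs. a rare word
    print(alpha / (alpha + p))  # ~0.091 vs. ~0.990
```

Frequent words are thus strongly down-weighted before averaging, which is the core idea of the SIF scheme.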
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class SIF +(model: gensim.models.keyedvectors.KeyedVectors, alpha: float = 0.001, components: int = 1, cache_size_gb: float = 1.0, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None) +
+
+

Train, use and evaluate averaged sentence vectors.

+

The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and +:meth:`~fse.models.average.Average.load` methods.

+

Some important attributes are the following:

+

Attributes

+
+
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+
This object essentially contains the mapping between words and embeddings. After training, it can be used +directly to query those embeddings in various ways. See the module level docstring for examples.
+
sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+
This object contains the sentence vectors inferred from the training data. There will be one such vector +for each unique sentence supplied during training. They may be individually accessed using the index.
+
prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+
The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used +to move the vectors to disk for training with memmap.
+
+

Smooth-inverse frequency (SIF) weighted sentence embeddings model. Performs a weighted averaging operation over all +words in a sentence. After training, the model removes a number of singular vectors.

+

The implementation is based on Arora et al. (2017): A Simple but Tough-to-Beat Baseline for Sentence Embeddings. +For more information, see https://openreview.net/pdf?id=SyK00v5xx and https://github.com/PrincetonML/SIF

+

Parameters

+
+
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings +the wv.vocab and wv.vector elements are required.
+
alpha : float, optional
+
Alpha is the weighting factor used to down-weight each individual word.
+
components : int, optional
+
Corresponds to the number of singular vectors to remove from the sentence embeddings.
+
cache_size_gb : float, optional
+
Cache size for computing the singular vectors in GB.
+
sv_mapfile_path : str, optional
+
Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+
wv_mapfile_path : str, optional
+
Optional path to store the word-vectors in for very large datasets. Used for memmap. +Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+
workers : int, optional
+
Number of working threads, used for multithreading. For most tasks (few words in a sentence) +a value of 1 should be more than enough.
+
lang_freq : str, optional
+
Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about +the frequency of a word. As the frequency is required for estimating the word weights, we induce +frequencies into the wv.vocab.count based on :class:~wordfreq +If no frequency information is available, you can choose the language to estimate the frequency. +See https://github.com/LuminosoInsight/wordfreq
+
+
+ +Expand source code + +
class SIF(Average):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        alpha: float = 1e-3,
+        components: int = 1,
+        cache_size_gb: float = 1.0,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+    ):
+        """Smooth-inverse frequency (SIF) weighted sentence embeddings model. Performs a weighted averaging operation over all
+        words in a sentence. After training, the model removes a number of singular vectors.
+
+        The implementation is based on Arora et al. (2017): A Simple but Tough-to-Beat Baseline for Sentence Embeddings.
+        For more information, see <https://openreview.net/pdf?id=SyK00v5xx> and <https://github.com/PrincetonML/SIF>
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.vocab and wv.vector elements are required.
+        alpha : float, optional
+            Alpha is the weighting factor used to down-weight each individual word.
+        components : int, optional
+            Corresponds to the number of singular vectors to remove from the sentence embeddings.
+        cache_size_gb : float, optional
+            Cache size for computing the singular vectors in GB.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv.vocab.count based on :class:`~wordfreq`
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+
+        """
+
+        self.alpha = float(alpha)
+        self.components = int(components)
+        self.cache_size_gb = float(cache_size_gb)
+        self.svd_res = None
+
+        if lang_freq is None:
+            logger.info(
+                "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument."
+            )
+
+        super(SIF, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+        )
+
+    def _check_parameter_sanity(self):
+        """ Check the sanity of all paramters """
+        if not all(self.word_weights <= 1.0) or not all(self.word_weights >= 0.0):
+            raise ValueError("For SIF, all word weights must be 0 <= w_weight <= 1")
+        if self.alpha <= 0.0:
+            raise ValueError("Alpha must be greater than zero.")
+        if self.components < 0.0:
+            raise ValueError("Components must be greater or equal zero")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training """
+        self._compute_sif_weights()
+
+    def _post_train_calls(self):
+        """ Function calls to perform after training, such as computing eigenvectors """
+        if self.components > 0:
+            self.svd_res = compute_principal_components(
+                self.sv.vectors,
+                components=self.components,
+                cache_size_gb=self.cache_size_gb,
+            )
+            remove_principal_components(
+                self.sv.vectors, svd_res=self.svd_res, inplace=True
+            )
+        else:
+            self.svd_res = 0
+            logger.info(f"no removal of principal components")
+
+    def _post_inference_calls(self, output: ndarray, **kwargs):
+        """ Function calls to perform after training & inference """
+        if self.svd_res is None:
+            raise RuntimeError(
+                "You must first train the model to obtain SVD components"
+            )
+        elif self.components > 0:
+            remove_principal_components(output, svd_res=self.svd_res, inplace=True)
+        else:
+            logger.info(f"no removal of principal components")
+
+    def _check_dtype_santiy(self):
+        """ Check the dtypes of all attributes """
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+        if self.svd_res is not None:
+            if self.svd_res[0].dtype != REAL:
+                raise TypeError(f"type of svd values is wrong: {self.svd_res[0].dtype}")
+            if self.svd_res[1].dtype != REAL:
+                raise TypeError(
+                    f"type of svd components is wrong: {self.svd_res[1].dtype}"
+                )
+
+    def _compute_sif_weights(self):
+        """ Precomputes the SIF weights for all words in the vocabulary """
+        logger.info(f"pre-computing SIF weights for {len(self.wv)} words")
+        v = len(self.wv)
+        corpus_size = 0
+
+        pw = zeros(v, dtype=REAL)
+        for word in self.wv.key_to_index:
+            c = self.wv.get_vecattr(word, "count")
+            if c < 0:
+                raise ValueError("vocab count is negative")
+            corpus_size += c
+            pw[self.wv.key_to_index[word]] = c
+        pw /= corpus_size
+
+        self.word_weights = (self.alpha / (self.alpha + pw)).astype(REAL)
+
+        if not all(isfinite(self.word_weights)) or any(self.word_weights < 0):
+            raise RuntimeError(
+                "Encountered nan values. "
+                "This likely happens because the word frequency information is wrong/missing. "
+                "Consider restarting using lang_freq argument to infer frequency. "
+            )
+
+

Ancestors

+ +

Inherited members

+ +
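A minimal usage sketch (editorial addition, assuming a gensim 4.x Word2Vec model and a toy corpus):
+
+from gensim.models import Word2Vec
+from fse import IndexedList
+from fse.models import SIF
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+w2v = Word2Vec(sentences, min_count=1, vector_size=20)
+
+model = SIF(w2v.wv, components=1)  # remove the first singular vector after training
+model.train(IndexedList(sentences))
+print(model.sv[0])  # SIF embedding of the first sentence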
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/fse/models/usif.html b/docs/fse/models/usif.html new file mode 100644 index 0000000..a9cd3f8 --- /dev/null +++ b/docs/fse/models/usif.html @@ -0,0 +1,489 @@ + + + + + + +fse.models.usif API documentation + + + + + + + + + + + +
+
+
+

Module fse.models.usif

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+import logging
+
+from gensim.models.keyedvectors import KeyedVectors
+from numpy import float32 as REAL
+from numpy import isfinite, ndarray, zeros
+
+from fse.models.average import Average
+from fse.models.utils import (
+    EPS,
+    compute_principal_components,
+    remove_principal_components,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class uSIF(Average):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        length: int = None,
+        components: int = 5,
+        cache_size_gb: float = 1.0,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+    ):
+        """Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings
+        model. Performs a weighted averaging operation over all words in a sentence.
+        After training, the model removes a number of weighted singular vectors.
+
+        The implementation is based on Ethayarajh (2018): Unsupervised Random Walk Sentence Embeddings: A Strong but Simple Baseline.
+        For more information, see <https://www.aclweb.org/anthology/W18-3012> and <https://github.com/kawine/usif>
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.vocab and wv.vector elements are required.
+        length : int, optional
+            Corresponds to the average number of words in a sentence in the training corpus.
+            If length is None, then the model takes the average number of words from
+            :meth:`~fse.models.base_s2v.BaseSentence2VecModel.scan_sentences`.
+            Is equivalent to n in the paper.
+        components : int, optional
+            Corresponds to the number of singular vectors to remove from the sentence embeddings.
+            Is equivalent to m in the paper.
+        cache_size_gb : float, optional
+            Cache size for computing the singular vectors in GB.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv.vocab.count based on :class:`~wordfreq`
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+        """
+
+        self.length = length
+        self.components = int(components)
+        self.cache_size_gb = float(cache_size_gb)
+        self.svd_res = None
+        self.svd_weights = None
+
+        if lang_freq is None:
+            logger.info(
+                "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument."
+            )
+
+        super(uSIF, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+        )
+
+    def _check_parameter_sanity(self):
+        """Check the sanity of all paramters."""
+        if self.length <= 0.0:
+            raise ValueError("Length must be greater than zero.")
+        if self.components < 0.0:
+            raise ValueError("Components must be greater or equal zero")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training."""
+        self.length = kwargs["average_length"] if self.length is None else self.length
+        self._compute_usif_weights()
+
+    def _post_train_calls(self):
+        """Function calls to perform after training, such as computing eigenvectors."""
+        if self.components > 0:
+            self.svd_res = compute_principal_components(
+                self.sv.vectors,
+                components=self.components,
+                cache_size_gb=self.cache_size_gb,
+            )
+            self.svd_weights = (
+                (self.svd_res[0] ** 2) / (self.svd_res[0] ** 2).sum()
+            ).astype(REAL)
+            remove_principal_components(
+                self.sv.vectors,
+                svd_res=self.svd_res,
+                weights=self.svd_weights,
+                inplace=True,
+            )
+        else:
+            self.svd_res = 0
+            logger.info(f"no removal of principal components")
+
+    def _post_inference_calls(self, output: ndarray, **kwargs):
+        """Function calls to perform after training & inference."""
+        if self.svd_res is None:
+            raise RuntimeError(
+                "You must first train the model to obtain SVD components"
+            )
+        elif self.components > 0:
+            remove_principal_components(
+                output, svd_res=self.svd_res, weights=self.svd_weights, inplace=True
+            )
+        else:
+            logger.info(f"no removal of principal components")
+
+    def _check_dtype_santiy(self):
+        """Check the dtypes of all attributes."""
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+        if self.svd_res is not None:
+            if self.svd_res[0].dtype != REAL:
+                raise TypeError(f"type of svd values is wrong: {self.svd_res[0].dtype}")
+            if self.svd_res[1].dtype != REAL:
+                raise TypeError(
+                    f"type of svd components is wrong: {self.svd_res[1].dtype}"
+                )
+            if self.svd_weights.dtype != REAL:
+                raise TypeError(
+                    f"type of svd weights is wrong: {self.svd_weights.dtype}"
+                )
+
+    def _compute_usif_weights(self):
+        """Precomputes the uSIF weights."""
+        logger.info(f"pre-computing uSIF weights for {len(self.wv)} words")
+        v = len(self.wv)
+        corpus_size = 0
+
+        pw = zeros(v, dtype=REAL)
+        for word in self.wv.key_to_index:
+            c = self.wv.get_vecattr(word, "count")
+            if c < 0:
+                raise ValueError("vocab count is negative")
+            corpus_size += c
+            pw[self.wv.key_to_index[word]] = c
+        pw /= corpus_size
+
+        threshold = 1 - (1 - (1 / v)) ** self.length
+        alpha = sum(pw > threshold) / v
+        z = v / 2
+        a = (1 - alpha) / ((alpha * z) + EPS)
+
+        self.word_weights = (a / ((a / 2) + pw)).astype(REAL)
+
+        if not all(isfinite(self.word_weights)):
+            raise RuntimeError(
+                "Encountered nan values. "
+                "This likely happens because the word frequency information is wrong/missing. "
+                "Consider restarting using lang_freq argument to infer frequency. "
+            )
+
+
+
+
+
+
+
+
+
+
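A hedged sketch of the weight derivation in _compute_usif_weights above; the vocabulary size, sentence length, and uniform probabilities are made up.
+
+import numpy as np
+
+v = 1000                                 # vocabulary size
+length = 11                              # average sentence length (n in the paper)
+p_w = np.full(v, 1.0 / v, dtype=np.float32)
+
+threshold = 1 - (1 - 1 / v) ** length    # chance a word occurs in a sentence
+alpha = (p_w > threshold).sum() / v      # share of "frequent" words
+z = v / 2
+a = (1 - alpha) / (alpha * z + 1e-7)     # the EPS term guards against alpha == 0
+weights = a / (a / 2 + p_w)              # uSIF word weights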

Classes

+
+
+class uSIF +(model: gensim.models.keyedvectors.KeyedVectors, length: int = None, components: int = 5, cache_size_gb: float = 1.0, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, lang_freq: str = None) +
+
+

Train, use and evaluate averaged sentence vectors.

+

The model can be stored/loaded via its :meth:`~fse.models.average.Average.save` and +:meth:`~fse.models.average.Average.load` methods.

+

Some important attributes are the following:

+

Attributes

+
+
wv : :class:`~gensim.models.keyedvectors.KeyedVectors`
+
This object essentially contains the mapping between words and embeddings. After training, it can be used +directly to query those embeddings in various ways. See the module level docstring for examples.
+
sv : :class:`~fse.models.sentencevectors.SentenceVectors`
+
This object contains the sentence vectors inferred from the training data. There will be one such vector +for each unique sentence supplied during training. They may be individually accessed using the index.
+
prep : :class:`~fse.models.base_s2v.BaseSentence2VecPreparer`
+
The prep object is used to transform and initialize the sv.vectors. Additionally, it can be used +to move the vectors to disk for training with memmap.
+
+

Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings +model. Performs a weighted averaging operation over all words in a sentence. +After training, the model removes a number of weighted singular vectors.

+

The implementation is based on Ethayarajh (2018): Unsupervised Random Walk Sentence Embeddings: A Strong but Simple Baseline. +For more information, see https://www.aclweb.org/anthology/W18-3012 and https://github.com/kawine/usif

+

Parameters

+
+
model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+
This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings +the wv.vocab and wv.vector elements are required.
+
length : int, optional
+
Corresponds to the average number of words in a sentence in the training corpus. +If length is None, then the model takes the average number of words from +:meth:`~fse.models.base_s2v.BaseSentence2VecModel.scan_sentences`. +Is equivalent to n in the paper.
+
components : int, optional
+
Corresponds to the number of singular vectors to remove from the sentence embeddings. +Is equivalent to m in the paper.
+
cache_size_gb : float, optional
+
Cache size for computing the singular vectors in GB.
+
sv_mapfile_path : str, optional
+
Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+
wv_mapfile_path : str, optional
+
Optional path to store the word-vectors in for very large datasets. Used for memmap. +Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+
workers : int, optional
+
Number of working threads, used for multithreading. For most tasks (few words in a sentence) +a value of 1 should be more than enough.
+
lang_freq : str, optional
+
Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about +the frequency of a word. As the frequency is required for estimating the word weights, we induce +frequencies into the wv.vocab.count based on :class:~wordfreq +If no frequency information is available, you can choose the language to estimate the frequency. +See https://github.com/LuminosoInsight/wordfreq
+
+
+ +Expand source code + +
class uSIF(Average):
+    def __init__(
+        self,
+        model: KeyedVectors,
+        length: int = None,
+        components: int = 5,
+        cache_size_gb: float = 1.0,
+        sv_mapfile_path: str = None,
+        wv_mapfile_path: str = None,
+        workers: int = 1,
+        lang_freq: str = None,
+    ):
+        """Unsupervised smooth-inverse frequency (uSIF) weighted sentence embeddings
+        model. Performs a weighted averaging operation over all words in a sentence.
+        After training, the model removes a number of weighted singular vectors.
+
+        The implementation is based on Ethayarajh (2018): Unsupervised Random Walk Sentence Embeddings: A Strong but Simple Baseline.
+        For more information, see <https://www.aclweb.org/anthology/W18-3012> and <https://github.com/kawine/usif>
+
+        Parameters
+        ----------
+        model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`
+            This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings
+            the wv.vocab and wv.vector elements are required.
+        length : int, optional
+            Corresponds to the average number of words in a sentence in the training corpus.
+            If length is None, then the model takes the average number of words from
+            :meth:`~fse.models.base_s2v.BaseSentence2VecModel.scan_sentences`.
+            Is equivalent to n in the paper.
+        components : int, optional
+            Corresponds to the number of singular vectors to remove from the sentence embeddings.
+            Is equivalent to m in the paper.
+        cache_size_gb : float, optional
+            Cache size for computing the singular vectors in GB.
+        sv_mapfile_path : str, optional
+            Optional path to store the sentence-vectors in for very large datasets. Used for memmap.
+        wv_mapfile_path : str, optional
+            Optional path to store the word-vectors in for very large datasets. Used for memmap.
+            Use sv_mapfile_path and wv_mapfile_path to train disk-to-disk without needing much ram.
+        workers : int, optional
+            Number of working threads, used for multithreading. For most tasks (few words in a sentence)
+            a value of 1 should be more than enough.
+        lang_freq : str, optional
+            Some pre-trained embeddings, i.e. "GoogleNews-vectors-negative300.bin", do not contain information about
+            the frequency of a word. As the frequency is required for estimating the word weights, we induce
+            frequencies into the wv.vocab.count based on :class:`~wordfreq`
+            If no frequency information is available, you can choose the language to estimate the frequency.
+            See https://github.com/LuminosoInsight/wordfreq
+        """
+
+        self.length = length
+        self.components = int(components)
+        self.cache_size_gb = float(cache_size_gb)
+        self.svd_res = None
+        self.svd_weights = None
+
+        if lang_freq is None:
+            logger.info(
+                "make sure you are using a model with valid word-frequency information. Otherwise use lang_freq argument."
+            )
+
+        super(uSIF, self).__init__(
+            model=model,
+            sv_mapfile_path=sv_mapfile_path,
+            wv_mapfile_path=wv_mapfile_path,
+            workers=workers,
+            lang_freq=lang_freq,
+        )
+
+    def _check_parameter_sanity(self):
+        """Check the sanity of all paramters."""
+        if self.length <= 0.0:
+            raise ValueError("Length must be greater than zero.")
+        if self.components < 0.0:
+            raise ValueError("Components must be greater or equal zero")
+
+    def _pre_train_calls(self, **kwargs):
+        """Function calls to perform before training."""
+        self.length = kwargs["average_length"] if self.length is None else self.length
+        self._compute_usif_weights()
+
+    def _post_train_calls(self):
+        """Function calls to perform after training, such as computing eigenvectors."""
+        if self.components > 0:
+            self.svd_res = compute_principal_components(
+                self.sv.vectors,
+                components=self.components,
+                cache_size_gb=self.cache_size_gb,
+            )
+            self.svd_weights = (
+                (self.svd_res[0] ** 2) / (self.svd_res[0] ** 2).sum()
+            ).astype(REAL)
+            remove_principal_components(
+                self.sv.vectors,
+                svd_res=self.svd_res,
+                weights=self.svd_weights,
+                inplace=True,
+            )
+        else:
+            self.svd_res = 0
+            logger.info(f"no removal of principal components")
+
+    def _post_inference_calls(self, output: ndarray, **kwargs):
+        """Function calls to perform after training & inference."""
+        if self.svd_res is None:
+            raise RuntimeError(
+                "You must first train the model to obtain SVD components"
+            )
+        elif self.components > 0:
+            remove_principal_components(
+                output, svd_res=self.svd_res, weights=self.svd_weights, inplace=True
+            )
+        else:
+            logger.info(f"no removal of principal components")
+
+    def _check_dtype_santiy(self):
+        """Check the dtypes of all attributes."""
+        if self.word_weights.dtype != REAL:
+            raise TypeError(f"type of word_weights is wrong: {self.word_weights.dtype}")
+        if self.svd_res is not None:
+            if self.svd_res[0].dtype != REAL:
+                raise TypeError(f"type of svd values is wrong: {self.svd_res[0].dtype}")
+            if self.svd_res[1].dtype != REAL:
+                raise TypeError(
+                    f"type of svd components is wrong: {self.svd_res[1].dtype}"
+                )
+            if self.svd_weights.dtype != REAL:
+                raise TypeError(
+                    f"type of svd weights is wrong: {self.svd_weights.dtype}"
+                )
+
+    def _compute_usif_weights(self):
+        """Precomputes the uSIF weights."""
+        logger.info(f"pre-computing uSIF weights for {len(self.wv)} words")
+        v = len(self.wv)
+        corpus_size = 0
+
+        pw = zeros(v, dtype=REAL)
+        for word in self.wv.key_to_index:
+            c = self.wv.get_vecattr(word, "count")
+            if c < 0:
+                raise ValueError("vocab count is negative")
+            corpus_size += c
+            pw[self.wv.key_to_index[word]] = c
+        pw /= corpus_size
+
+        threshold = 1 - (1 - (1 / v)) ** self.length
+        alpha = sum(pw > threshold) / v
+        z = v / 2
+        a = (1 - alpha) / ((alpha * z) + EPS)
+
+        self.word_weights = (a / ((a / 2) + pw)).astype(REAL)
+
+        if not all(isfinite(self.word_weights)):
+            raise RuntimeError(
+                "Encountered nan values. "
+                "This likely happens because the word frequency information is wrong/missing. "
+                "Consider restarting using lang_freq argument to infer frequency. "
+            )
+
+

Ancestors

+ +

Inherited members

+ +
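A minimal usage sketch (editorial addition; gensim 4.x and a toy corpus are assumed):
+
+from gensim.models import Word2Vec
+from fse import IndexedList
+from fse.models import uSIF
+
+sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+w2v = Word2Vec(sentences, min_count=1, vector_size=20)
+
+model = uSIF(w2v.wv, length=3, components=1)  # length ~ words per sentence
+model.train(IndexedList(sentences))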
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/fse/models/utils.html b/docs/fse/models/utils.html new file mode 100644 index 0000000..2b5666f --- /dev/null +++ b/docs/fse/models/utils.html @@ -0,0 +1,495 @@ + + + + + + +fse.models.utils API documentation + + + + + + + + + + + +
+
+
+

Module fse.models.utils

+
+
+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+
+from typing import Tuple
+from sklearn.decomposition import TruncatedSVD
+
+from numpy import finfo, ndarray, float32 as REAL, ones, dtype
+from numpy.random import choice
+
+from time import time
+
+import logging
+
+from sys import platform
+
+import ctypes
+
+logger = logging.getLogger(__name__)
+
+EPS = finfo(REAL).eps
+
+
+def set_madvise_for_mmap(return_madvise: bool = False) -> object:
+    """Method used to set madvise parameters.
+    This addresses the memmap issue raised in https://github.com/numpy/numpy/issues/13172.
+    The issue does not apply to Windows.
+
+    Parameters
+    ----------
+    return_madvise : bool
+        Returns the madvise object for unittests, see test_utils.py
+
+    Returns
+    -------
+    object
+        madvise object
+
+    """
+
+    if platform in ["linux", "linux2", "darwin", "aix"]:
+        if platform == "darwin":
+            # Path different for Macos
+            madvise = ctypes.CDLL("libc.dylib").madvise
+        if platform in ["linux", "linux2", "aix"]:
+            madvise = ctypes.CDLL("libc.so.6").madvise
+        madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
+        madvise.restype = ctypes.c_int
+
+        if return_madvise:
+            return madvise
+
+
+def compute_principal_components(
+    vectors: ndarray, components: int = 1, cache_size_gb: float = 1.0
+) -> Tuple[ndarray, ndarray]:
+    """Method used to compute the first singular vectors of a given (sub)matrix
+
+    Parameters
+    ----------
+    vectors : ndarray
+        (Sentence) vectors to compute the truncated SVD on
+    components : int, optional
+        Number of singular values/vectors to compute
+    cache_size_gb : float, optional
+            Cache size for computing the principal components in GB
+
+    Returns
+    -------
+    ndarray, ndarray
+        Singular values and singular vectors
+    """
+    start = time()
+    num_vectors = vectors.shape[0]
+    svd = TruncatedSVD(
+        n_components=components, n_iter=7, random_state=42, algorithm="randomized"
+    )
+
+    sample_size = int(
+        1024 ** 3 * cache_size_gb / (vectors.shape[1] * dtype(REAL).itemsize)
+    )
+
+    if sample_size > num_vectors:
+        svd.fit(vectors)
+    else:
+        logger.info(f"sampling {sample_size} vectors to compute principal components")
+        sample_indices = choice(range(num_vectors), replace=False, size=sample_size)
+        svd.fit(vectors[sample_indices, :])
+
+    elapsed = time()
+    logger.info(
+        f"computing {components} principal components took {int(elapsed-start)}s"
+    )
+    return svd.singular_values_.astype(REAL), svd.components_.astype(REAL)
+
+
+def remove_principal_components(
+    vectors: ndarray,
+    svd_res: Tuple[ndarray, ndarray],
+    weights: ndarray = None,
+    inplace: bool = True,
+) -> ndarray:
+    """Method used to remove the first singular vectors of a given matrix
+
+    Parameters
+    ----------
+    vectors : ndarray
+        (Sentence) vectors to remove components from
+    svd_res : (ndarray, ndarray)
+        Tuple consisting of the singular values and components to remove from the vectors
+    weights : ndarray, optional
+        Weights to be used to weigh the components which are removed from the vectors
+    inplace : bool, optional
+        If true, removes the components from the vectors inplace (memory efficient)
+
+    Returns
+    -------
+    ndarray
+        The vectors with the principal components removed, or None if inplace=True
+    """
+    components = svd_res[1].astype(REAL)
+
+    start = time()
+    if weights is None:
+        w_comp = components * ones(len(components), dtype=REAL)[:, None]
+    else:
+        w_comp = components * (weights[:, None].astype(REAL))
+
+    output = None
+    if len(components) == 1:
+        if not inplace:
+            output = vectors - vectors.dot(w_comp.transpose()) * w_comp
+        else:
+            vectors -= vectors.dot(w_comp.transpose()) * w_comp
+    else:
+        if not inplace:
+            output = vectors - vectors.dot(w_comp.transpose()).dot(w_comp)
+        else:
+            vectors -= vectors.dot(w_comp.transpose()).dot(w_comp)
+    elapsed = time()
+
+    logger.info(
+        f"removing {len(components)} principal components took {int(elapsed-start)}s"
+    )
+    if not inplace:
+        return output
+
+
+
+
+
+
+
+
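A hedged end-to-end sketch of the two helpers above on toy data (shapes are made up). Note the cache arithmetic: with cache_size_gb=1.0 and 300-dimensional float32 vectors, the sampling cutoff is int(1024**3 * 1.0 / (300 * 4)) = 894,784 vectors; larger matrices are subsampled before the SVD.
+
+import numpy as np
+from fse.models.utils import compute_principal_components, remove_principal_components
+
+vectors = np.random.rand(1000, 300).astype(np.float32)
+
+svd_res = compute_principal_components(vectors, components=1)
+remove_principal_components(vectors, svd_res=svd_res, inplace=True)  # modifies in place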

Functions

+
+
+def choice(a, size=None, replace=True, p=None) +
+
+

Generates a random sample from a given 1-D array

+
+

Added in version: 1.7.0

+
+
+

Note

+

New code should use the RandomState.choice() method of a default_rng() +instance instead; please see the :ref:random-quick-start.

+
+

Parameters

+
+
a : 1-D array-like or int
+
If an ndarray, a random sample is generated from its elements. +If an int, the random sample is generated as if it were np.arange(a)
+
size : int or tuple of ints, optional
+
Output shape. +If the given shape is, e.g., (m, n, k), then +m * n * k samples are drawn. +Default is None, in which case a +single value is returned.
+
replace : boolean, optional
+
Whether the sample is with or without replacement. Default is True, +meaning that a value of a can be selected multiple times.
+
p : 1-D array-like, optional
+
The probabilities associated with each entry in a. +If not given, the sample assumes a uniform distribution over all +entries in a.
+
+

Returns

+
+
samples : single item or ndarray
+
The generated random samples
+
+

Raises

+
+
ValueError
+
If a is an int and less than zero, if a or p are not 1-dimensional, +if a is an array-like of size 0, if p is not a vector of +probabilities, if a and p have different lengths, or if +replace=False and the sample size is greater than the population +size
+
+

See Also

+

randint, shuffle, permutation +Generator.choice: which should be used in new code

+

Notes

+

Setting user-specified probabilities through p uses a more general but less +efficient sampler than the default. The general sampler produces a different sample +than the optimized sampler even if each element of p is 1 / len(a).

+

Sampling random rows from a 2-D array is not possible with this function, +but is possible with Generator.choice through its axis keyword.

+

Examples

+

Generate a uniform random sample from np.arange(5) of size 3:

+
>>> np.random.choice(5, 3)
+array([0, 3, 4]) # random
+>>> #This is equivalent to np.random.randint(0,5,3)
+
+

Generate a non-uniform random sample from np.arange(5) of size 3:

+
>>> np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0])
+array([3, 3, 0]) # random
+
+

Generate a uniform random sample from np.arange(5) of size 3 without +replacement:

+
>>> np.random.choice(5, 3, replace=False)
+array([3,1,0]) # random
+>>> #This is equivalent to np.random.permutation(np.arange(5))[:3]
+
+

Generate a non-uniform random sample from np.arange(5) of size +3 without replacement:

+
>>> np.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0])
+array([2, 3, 0]) # random
+
+

Any of the above can be repeated with an arbitrary array-like +instead of just integers. For instance:

+
>>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
+>>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3])
+array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'], # random
+      dtype='<U11')
+
+
+
+def compute_principal_components(vectors: numpy.ndarray, components: int = 1, cache_size_gb: float = 1.0) ‑> Tuple[numpy.ndarray, numpy.ndarray] +
+
+

Method used to compute the first singular vectors of a given (sub)matrix

+

Parameters

+
+
vectors : ndarray
+
(Sentence) vectors to compute the truncated SVD on
+
components : int, optional
+
Number of singular values/vectors to compute
+
cache_size_gb : float, optional
+
Cache size for computing the principal components in GB
+
+

Returns

+
+
ndarray, ndarray
+
Singular values and singular vectors
+
+
+ +Expand source code + +
def compute_principal_components(
+    vectors: ndarray, components: int = 1, cache_size_gb: float = 1.0
+) -> Tuple[ndarray, ndarray]:
+    """Method used to compute the first singular vectors of a given (sub)matrix
+
+    Parameters
+    ----------
+    vectors : ndarray
+        (Sentence) vectors to compute the truncated SVD on
+    components : int, optional
+        Number of singular values/vectors to compute
+    cache_size_gb : float, optional
+            Cache size for computing the principal components in GB
+
+    Returns
+    -------
+    ndarray, ndarray
+        Singular values and singular vectors
+    """
+    start = time()
+    num_vectors = vectors.shape[0]
+    svd = TruncatedSVD(
+        n_components=components, n_iter=7, random_state=42, algorithm="randomized"
+    )
+
+    sample_size = int(
+        1024 ** 3 * cache_size_gb / (vectors.shape[1] * dtype(REAL).itemsize)
+    )
+
+    if sample_size > num_vectors:
+        svd.fit(vectors)
+    else:
+        logger.info(f"sampling {sample_size} vectors to compute principal components")
+        sample_indices = choice(range(num_vectors), replace=False, size=sample_size)
+        svd.fit(vectors[sample_indices, :])
+
+    elapsed = time()
+    logger.info(
+        f"computing {components} principal components took {int(elapsed-start)}s"
+    )
+    return svd.singular_values_.astype(REAL), svd.components_.astype(REAL)
+
+
+
+def remove_principal_components(vectors: numpy.ndarray, svd_res: Tuple[numpy.ndarray, numpy.ndarray], weights: numpy.ndarray = None, inplace: bool = True) ‑> numpy.ndarray +
+
+

Method used to remove the first singular vectors of a given matrix

+

Parameters

+
+
vectors : ndarray
+
(Sentence) vectors to remove components from
+
svd_res : (ndarray, ndarray)
+
Tuple consisting of the singular values and components to remove from the vectors
+
weights : ndarray, optional
+
Weights to be used to weigh the components which are removed from the vectors
+
inplace : bool, optional
+
If true, removes the components from the vectors inplace (memory efficient)
+
+

Returns

+
+
ndarray
+
The vectors with the principal components removed, or None if inplace=True
+
+
+ +Expand source code + +
def remove_principal_components(
+    vectors: ndarray,
+    svd_res: Tuple[ndarray, ndarray],
+    weights: ndarray = None,
+    inplace: bool = True,
+) -> ndarray:
+    """Method used to remove the first singular vectors of a given matrix
+
+    Parameters
+    ----------
+    vectors : ndarray
+        (Sentence) vectors to remove components from
+    svd_res : (ndarray, ndarray)
+        Tuple consisting of the singular values and components to remove from the vectors
+    weights : ndarray, optional
+        Weights to be used to weigh the components which are removed from the vectors
+    inplace : bool, optional
+        If true, removes the components from the vectors inplace (memory efficient)
+
+    Returns
+    -------
+    ndarray
+        The vectors with the principal components removed, or None if inplace=True
+    """
+    components = svd_res[1].astype(REAL)
+
+    start = time()
+    if weights is None:
+        w_comp = components * ones(len(components), dtype=REAL)[:, None]
+    else:
+        w_comp = components * (weights[:, None].astype(REAL))
+
+    output = None
+    if len(components) == 1:
+        if not inplace:
+            output = vectors - vectors.dot(w_comp.transpose()) * w_comp
+        else:
+            vectors -= vectors.dot(w_comp.transpose()) * w_comp
+    else:
+        if not inplace:
+            output = vectors - vectors.dot(w_comp.transpose()).dot(w_comp)
+        else:
+            vectors -= vectors.dot(w_comp.transpose()).dot(w_comp)
+    elapsed = time()
+
+    logger.info(
+        f"removing {len(components)} principal components took {int(elapsed-start)}s"
+    )
+    if not inplace:
+        return output
+
+
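In the uSIF case, the removal above is weighted by the normalized squared singular values; a hedged sketch with toy shapes:
+
+import numpy as np
+from fse.models.utils import compute_principal_components, remove_principal_components
+
+vectors = np.random.rand(100, 20).astype(np.float32)
+svals, comps = compute_principal_components(vectors, components=5)
+svd_weights = (svals ** 2 / (svals ** 2).sum()).astype(np.float32)
+out = remove_principal_components(
+    vectors, svd_res=(svals, comps), weights=svd_weights, inplace=False
+)  # returns a new array; the input stays untouched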
+
+def set_madvise_for_mmap(return_madvise: bool = False) ‑> object +
+
+

Method used to set madvise parameters. +This addresses the memmap issue raised in https://github.com/numpy/numpy/issues/13172. +The issue does not apply to Windows.

+

Parameters

+
+
return_madvise : bool
+
Returns the madvise object for unittests, see test_utils.py
+
+

Returns

+
+
object
+
madvise object
+
+
+ +Expand source code + +
def set_madvise_for_mmap(return_madvise: bool = False) -> object:
+    """Method used to set madvise parameters.
+    This addresses the memmap issue raised in https://github.com/numpy/numpy/issues/13172.
+    The issue does not apply to Windows.
+
+    Parameters
+    ----------
+    return_madvise : bool
+        Returns the madvise object for unittests, see test_utils.py
+
+    Returns
+    -------
+    object
+        madvise object
+
+    """
+
+    if platform in ["linux", "linux2", "darwin", "aix"]:
+        if platform == "darwin":
+            # Path different for Macos
+            madvise = ctypes.CDLL("libc.dylib").madvise
+        if platform in ["linux", "linux2", "aix"]:
+            madvise = ctypes.CDLL("libc.so.6").madvise
+        madvise.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int]
+        madvise.restype = ctypes.c_int
+
+        if return_madvise:
+            return madvise
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/fse/vectors.html b/docs/fse/vectors.html new file mode 100644 index 0000000..a851450 --- /dev/null +++ b/docs/fse/vectors.html @@ -0,0 +1,421 @@ + + + + + + +fse.vectors API documentation + + + + + + + + + + + +
+
+
+

Module fse.vectors

+
+
+

Classes to obtain KeyedVectors from pretrained models.

+
+ +Expand source code + +
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Author: Oliver Borchers
+# Copyright (C) Oliver Borchers
+# Licensed under GNU General Public License v3.0
+
+"""Class to obtain BaseKeyedVector from."""
+
+from pathlib import Path
+
+from gensim.models.fasttext import FastTextKeyedVectors
+from gensim.models.keyedvectors import KeyedVectors
+from huggingface_hub import snapshot_download
+from requests import HTTPError
+
+_SUFFIX: str = ".model"
+
+
+class Vectors(KeyedVectors):
+    """Class to instantiates vectors from pretrained models."""
+
+    @classmethod
+    def from_pretrained(cls, model: str, mmap: str = None):
+        """Method to load vectors from a pre-trained model.
+
+        Parameters
+        ----------
+        model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+        mmap : :str: Whether to load the vectors in mmap mode.
+
+        Returns
+        -------
+        Vectors
+            An object of pretrained vectors.
+        """
+        try:
+            path = Path(snapshot_download(repo_id=f"fse/{model}"))
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise ValueError(f"model {model} does not exist")
+            raise
+
+        assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+        return super(Vectors, cls).load(
+            (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+        )
+
+
+class FTVectors(FastTextKeyedVectors):
+    """Class to instantiates FT vectors from pretrained models."""
+
+    @classmethod
+    def from_pretrained(cls, model: str, mmap: str = None):
+        """Method to load vectors from a pre-trained model.
+
+        Parameters
+        ----------
+        model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+        mmap : :str: Whether to load the vectors in mmap mode.
+
+        Returns
+        -------
+        Vectors
+            An object of pretrained vectors.
+        """
+        try:
+            path = Path(snapshot_download(repo_id=f"fse/{model}"))
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise ValueError(f"model {model} does not exist")
+            raise
+
+        assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+        return super(FTVectors, cls).load(
+            (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+        )
+
+
+
+
+
+
+
+
+
+
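A minimal usage sketch (editorial addition; requires network access to the fse namespace on the Hugging Face hub):
+
+from fse.vectors import Vectors
+
+vecs = Vectors.from_pretrained("glove-wiki-gigaword-50")
+print(vecs["cat"].shape)  # (50,)
+
For subword-aware embeddings, FTVectors.from_pretrained works the same way but returns FastText vectors.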

Classes

+
+
+class FTVectors +(vector_size, min_n, max_n, bucket, count=0, dtype=numpy.float32) +
+
+

Class to instantiate FastText vectors from pretrained models.

+

Vectors and vocab for :class:~gensim.models.fasttext.FastText.

+

Implements significant parts of the FastText algorithm. +For example, +the :func:word_vec calculates vectors for out-of-vocabulary (OOV) +entities. +FastText achieves this by keeping vectors for ngrams: +adding the vectors for the ngrams of an entity yields the vector for the +entity.

+

Similar to a hashmap, this class keeps a fixed number of buckets, and +maps all ngrams to buckets using a hash function.

+

Parameters

+
+
vector_size : int
+
The dimensionality of all vectors.
+
min_n : int
+
The minimum number of characters in an ngram
+
max_n : int
+
The maximum number of characters in an ngram
+
bucket : int
+
The number of buckets.
+
count : int, optional
+
If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise +they can be added later.)
+
dtype : type, optional
+
Vector dimensions will default to np.float32 (AKA REAL in some Gensim code) unless +another type is provided here.
+
+

Attributes

+
+
vectors_vocab : np.array
+
Each row corresponds to a vector for an entity in the vocabulary. +Columns correspond to vector dimensions. When embedded in a full +FastText model, these are the full-word-token vectors updated +by training, whereas the inherited vectors are the actual per-word +vectors synthesized from the full-word-token and all subword (ngram) +vectors.
+
vectors_ngrams : np.array
+
A vector for each ngram across all entities in the vocabulary. +Each row is a vector that corresponds to a bucket. +Columns correspond to vector dimensions.
+
buckets_word : list of np.array
+
For each key (by its index), report bucket slots their subwords map to.
+
+

When used in training, FastTextKeyedVectors may be decorated with +extra attributes that closely associate with its core attributes, +such as the experimental vectors_vocab_lockf and vectors_ngrams_lockf +training-update-dampening factors.

+
+ +Expand source code + +
class FTVectors(FastTextKeyedVectors):
+    """Class to instantiates FT vectors from pretrained models."""
+
+    @classmethod
+    def from_pretrained(cls, model: str, mmap: str = None):
+        """Method to load vectors from a pre-trained model.
+
+        Parameters
+        ----------
+        model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+        mmap : :str: Whether to load the vectors in mmap mode.
+
+        Returns
+        -------
+        Vectors
+            An object of pretrained vectors.
+        """
+        try:
+            path = Path(snapshot_download(repo_id=f"fse/{model}"))
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise ValueError(f"model {model} does not exist")
+            raise
+
+        assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+        return super(FTVectors, cls).load(
+            (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+        )
+
+

Ancestors

+
    +
  • gensim.models.fasttext.FastTextKeyedVectors
  • +
  • gensim.models.keyedvectors.KeyedVectors
  • +
  • gensim.utils.SaveLoad
  • +
+

Static methods

+
+
+def from_pretrained(model: str, mmap: str = None) +
+
+

Method to load vectors from a pre-trained model.

+

Parameters

+
+
model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+
 
+
+

mmap : :str: Whether to load the vectors in mmap mode.

+

Returns

+
+
Vectors
+
An object of pretrained vectors.
+
+
+ +Expand source code + +
@classmethod
+def from_pretrained(cls, model: str, mmap: str = None):
+    """Method to load vectors from a pre-trained model.
+
+    Parameters
+    ----------
+    model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+    mmap : :str: Whether to load the vectors in mmap mode.
+
+    Returns
+    -------
+    Vectors
+        An object of pretrained vectors.
+    """
+    try:
+        path = Path(snapshot_download(repo_id=f"fse/{model}"))
+    except HTTPError as err:
+        if err.response.status_code == 404:
+            raise ValueError(f"model {model} does not exist")
+        raise
+
+    assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+    return super(FTVectors, cls).load(
+        (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+    )
+
+
+
+
+
+class Vectors +(vector_size, count=0, dtype=numpy.float32, mapfile_path=None) +
+
+

Class to instantiate vectors from pretrained models.

+

Mapping between keys (such as words) and vectors for :class:~gensim.models.Word2Vec +and related models.

+

Used to perform operations on the vectors such as vector lookup, distance, similarity etc.

+

To support the needs of specific models and other downstream uses, you can also set +additional attributes via the :meth:~gensim.models.keyedvectors.KeyedVectors.set_vecattr +and :meth:~gensim.models.keyedvectors.KeyedVectors.get_vecattr methods. +Note that all such attributes under the same attr name must have compatible numpy +types, as the type and storage array for such attributes is established by the 1st time such +attr is set.

+

Parameters

+
+
vector_size : int
+
Intended number of dimensions for all contained vectors.
+
count : int, optional
+
If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise +they can be added later.)
+
dtype : type, optional
+
Vector dimensions will default to np.float32 (AKA REAL in some Gensim code) unless +another type is provided here.
+
mapfile_path : string, optional
+
Currently unused.
+
+
+ +Expand source code + +
class Vectors(KeyedVectors):
+    """Class to instantiates vectors from pretrained models."""
+
+    @classmethod
+    def from_pretrained(cls, model: str, mmap: str = None):
+        """Method to load vectors from a pre-trained model.
+
+        Parameters
+        ----------
+        model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+        mmap : :str: Whether to load the vectors in mmap mode.
+
+        Returns
+        -------
+        Vectors
+            An object of pretrained vectors.
+        """
+        try:
+            path = Path(snapshot_download(repo_id=f"fse/{model}"))
+        except HTTPError as err:
+            if err.response.status_code == 404:
+                raise ValueError(f"model {model} does not exist")
+            raise
+
+        assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+        return super(Vectors, cls).load(
+            (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+        )
+
+

Ancestors

+
    +
  • gensim.models.keyedvectors.KeyedVectors
  • +
  • gensim.utils.SaveLoad
  • +
+

Static methods

+
+
+def from_pretrained(model: str, mmap: str = None) +
+
+

Method to load vectors from a pre-trained model.

+

Parameters

+
+
model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+
 
+
+

mmap : :str: Whether to load the vectors in mmap mode.

+

Returns

+
+
Vectors
+
An object of pretrained vectors.
+
+
+ +Expand source code + +
@classmethod
+def from_pretrained(cls, model: str, mmap: str = None):
+    """Method to load vectors from a pre-trained model.
+
+    Parameters
+    ----------
+    model : :str: of the model name to load from the hub. For example: "glove-wiki-gigaword-50"
+    mmap : :str: Whether to load the vectors in mmap mode.
+
+    Returns
+    -------
+    Vectors
+        An object of pretrained vectors.
+    """
+    try:
+        path = Path(snapshot_download(repo_id=f"fse/{model}"))
+    except HTTPError as err:
+        if err.response.status_code == 404:
+            raise ValueError(f"model {model} does not exist")
+        raise
+
+    assert path.exists(), "something went wrong. the file wasn't downloaded."
+
+    return super(Vectors, cls).load(
+        (path / (model + _SUFFIX)).as_posix(), mmap=mmap
+    )
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/fse/inputs.py b/fse/inputs.py index 60ea7fc..562138a 100644 --- a/fse/inputs.py +++ b/fse/inputs.py @@ -365,7 +365,8 @@ def _build_offsets(self): offset += len(line) def __getitem__(self, i): - """Returns the line indexed by i. Primarily used for + """Returns the line indexed by i. Primarily used for. + :meth:`~fse.models.sentencevectors.SentenceVectors.most_similar` Parameters @@ -377,7 +378,6 @@ def __getitem__(self, i): ------- str line at the current index - """ if not self.get_able: raise RuntimeError( diff --git a/fse/models/average.py b/fse/models/average.py index e87db63..ffd6438 100644 --- a/fse/models/average.py +++ b/fse/models/average.py @@ -20,7 +20,7 @@ >>> from gensim.models.word2vec import Word2Vec >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> model = Word2Vec(sentences, min_count=1, size=20) + >>> model = Word2Vec(sentences, min_count=1, vector_size=20) >>> from fse.models.average import Average >>> avg = Average(model) @@ -34,8 +34,8 @@ from fse.models.base_s2v import BaseSentence2VecModel -from gensim.models.keyedvectors import BaseKeyedVectors -from gensim.models.utils_any2vec import ft_ngram_hashes +from gensim.models.keyedvectors import KeyedVectors +from gensim.models.fasttext import ft_ngram_hashes from numpy import ( ndarray, @@ -46,7 +46,7 @@ max as np_max, ) -from typing import List +from typing import List, Tuple import logging @@ -58,7 +58,7 @@ def train_average_np( indexed_sentences: List[tuple], target: ndarray, memory: ndarray, -) -> [int, int]: +) -> Tuple[int, int]: """Training on a sequence of sentences and update the target ndarray. Called internally from :meth:`~fse.models.average.Average._do_train_job`. @@ -88,7 +88,6 @@ def train_average_np( """ size = model.wv.vector_size - vocab = model.wv.vocab w_vectors = model.wv.vectors w_weights = model.word_weights @@ -121,7 +120,11 @@ def train_average_np( sent = obj[0] sent_adr = obj[1] - word_indices = [vocab[word].index for word in sent if word in vocab] + word_indices = [ + model.wv.key_to_index[word] + for word in sent + if word in model.wv.key_to_index + ] eff_sentences += 1 if not len(word_indices): continue @@ -147,11 +150,11 @@ def train_average_np( eff_words += len(sent) # Counts everything in the sentence for word in sent: - if word in vocab: - word_index = vocab[word].index + if word in model.wv.key_to_index: + word_index = model.wv.key_to_index[word] mem += w_vectors[word_index] * w_weights[word_index] else: - ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket, True)[ + ngram_hashes = ft_ngram_hashes(word, min_n, max_n, bucket)[ :max_ngrams ] if len(ngram_hashes) == 0: @@ -191,7 +194,7 @@ class Average(BaseSentence2VecModel): Attributes ---------- - wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` + wv : :class:`~gensim.models.keyedvectors.KeyedVectors` This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples. 
@@ -207,7 +210,7 @@ class Average(BaseSentence2VecModel): def __init__( self, - model: BaseKeyedVectors, + model: KeyedVectors, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, @@ -222,7 +225,7 @@ def __init__( Parameters ---------- - model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings the wv.vocab and wv.vector elements are required. sv_mapfile_path : str, optional @@ -255,7 +258,7 @@ def __init__( def _do_train_job( self, data_iterable: List[tuple], target: ndarray, memory: ndarray - ) -> [int, int]: + ) -> Tuple[int, int]: """ Internal routine which is called on training and performs averaging for all entries in the iterable """ eff_sentences, eff_words = train_average( model=self, indexed_sentences=data_iterable, target=target, memory=memory diff --git a/fse/models/average_inner.c b/fse/models/average_inner.c index 204595f..ac7f8be 100644 --- a/fse/models/average_inner.c +++ b/fse/models/average_inner.c @@ -1815,7 +1815,6 @@ static const char __pyx_k_main[] = "__main__"; static const char __pyx_k_name[] = "__name__"; static const char __pyx_k_test[] = "__test__"; static const char __pyx_k_fblas[] = "fblas"; -static const char __pyx_k_index[] = "index"; static const char __pyx_k_is_ft[] = "is_ft"; static const char __pyx_k_max_n[] = "max_n"; static const char __pyx_k_min_n[] = "min_n"; @@ -1824,7 +1823,6 @@ static const char __pyx_k_numpy[] = "numpy"; static const char __pyx_k_range[] = "range"; static const char __pyx_k_saxpy[] = "saxpy"; static const char __pyx_k_sscal[] = "sscal"; -static const char __pyx_k_vocab[] = "vocab"; static const char __pyx_k_bucket[] = "bucket"; static const char __pyx_k_import[] = "__import__"; static const char __pyx_k_memory[] = "memory"; @@ -1838,6 +1836,7 @@ static const char __pyx_k_enumerate[] = "enumerate"; static const char __pyx_k_ImportError[] = "ImportError"; static const char __pyx_k_vector_size[] = "vector_size"; static const char __pyx_k_FAST_VERSION[] = "FAST_VERSION"; +static const char __pyx_k_key_to_index[] = "key_to_index"; static const char __pyx_k_word_weights[] = "word_weights"; static const char __pyx_k_eff_sentences[] = "eff_sentences"; static const char __pyx_k_ft_hash_bytes[] = "ft_hash_bytes"; @@ -1850,9 +1849,9 @@ static const char __pyx_k_MAX_WORDS_IN_BATCH[] = "MAX_WORDS_IN_BATCH"; static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; static const char __pyx_k_MAX_NGRAMS_IN_BATCH[] = "MAX_NGRAMS_IN_BATCH"; static const char __pyx_k_compute_ngrams_bytes[] = "compute_ngrams_bytes"; +static const char __pyx_k_gensim_models_fasttext[] = "gensim.models.fasttext"; static const char __pyx_k_fse_models_average_inner[] = "fse.models.average_inner"; static const char __pyx_k_fse_models_average_inner_pyx[] = "fse/models/average_inner.pyx"; -static const char __pyx_k_gensim_models__utils_any2vec[] = "gensim.models._utils_any2vec"; static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; static const char __pyx_k_Optimized_cython_functions_for_c[] = "Optimized cython functions for computing sentence embeddings"; static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; @@ -1874,12 +1873,12 @@ static 
PyObject *__pyx_n_s_fse_models_average_inner; static PyObject *__pyx_kp_s_fse_models_average_inner_pyx; static PyObject *__pyx_n_s_ft; static PyObject *__pyx_n_s_ft_hash_bytes; -static PyObject *__pyx_n_s_gensim_models__utils_any2vec; +static PyObject *__pyx_n_s_gensim_models_fasttext; static PyObject *__pyx_n_s_import; -static PyObject *__pyx_n_s_index; static PyObject *__pyx_n_s_indexed_sentences; static PyObject *__pyx_n_s_init; static PyObject *__pyx_n_s_is_ft; +static PyObject *__pyx_n_s_key_to_index; static PyObject *__pyx_n_s_main; static PyObject *__pyx_n_s_max; static PyObject *__pyx_n_s_max_n; @@ -1904,7 +1903,6 @@ static PyObject *__pyx_n_s_vector_size; static PyObject *__pyx_n_s_vectors; static PyObject *__pyx_n_s_vectors_ngrams; static PyObject *__pyx_n_s_vectors_vocab; -static PyObject *__pyx_n_s_vocab; static PyObject *__pyx_n_s_w2v; static PyObject *__pyx_n_s_word_weights; static PyObject *__pyx_n_s_workers; @@ -2279,7 +2277,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_init_ft_s2v_config(struct * * c[0].sentence_vectors = (np.PyArray_DATA(target)) # <<<<<<<<<<<<<< * - * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_sentences): + * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, wv, indexed_sentences): */ if (!(likely(((__pyx_v_target) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_target, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 97, __pyx_L1_error) (__pyx_v_c[0]).sentence_vectors = ((__pyx_t_3fse_6models_13average_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_target))); @@ -2311,12 +2309,12 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_init_ft_s2v_config(struct /* "fse/models/average_inner.pyx":99 * c[0].sentence_vectors = (np.PyArray_DATA(target)) * - * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_sentences): # <<<<<<<<<<<<<< + * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, wv, indexed_sentences): # <<<<<<<<<<<<<< * """Prepare C structures for BaseAny2VecModel so we can go "full C" and release the Python GIL. 
* */ -static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(struct __pyx_t_3fse_6models_13average_inner_BaseSentenceVecsConfig *__pyx_v_c, PyObject *__pyx_v_vocab, PyObject *__pyx_v_indexed_sentences) { +static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(struct __pyx_t_3fse_6models_13average_inner_BaseSentenceVecsConfig *__pyx_v_c, PyObject *__pyx_v_wv, PyObject *__pyx_v_indexed_sentences) { __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_v_eff_words; __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_v_eff_sents; PyObject *__pyx_v_obj = NULL; @@ -2432,7 +2430,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s * if not obj[0]: * continue # <<<<<<<<<<<<<< * for token in obj[0]: - * word = vocab[token] if token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None */ goto __pyx_L3_continue; @@ -2449,7 +2447,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s * if not obj[0]: * continue * for token in obj[0]: # <<<<<<<<<<<<<< - * word = vocab[token] if token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None * if word is None: */ __pyx_t_4 = __Pyx_GetItemInt(__pyx_v_obj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 131, __pyx_L1_error) @@ -2500,16 +2498,17 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s /* "fse/models/average_inner.pyx":132 * continue * for token in obj[0]: - * word = vocab[token] if token in vocab else None # Vocab obj # <<<<<<<<<<<<<< + * word = token if token in wv.key_to_index else None # <<<<<<<<<<<<<< * if word is None: * continue */ - __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vocab, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 132, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_wv, __pyx_n_s_key_to_index); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_t_10, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 132, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; if ((__pyx_t_6 != 0)) { - __pyx_t_10 = __Pyx_PyObject_GetItem(__pyx_v_vocab, __pyx_v_token); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 132, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_10); - __pyx_t_4 = __pyx_t_10; - __pyx_t_10 = 0; + __Pyx_INCREF(__pyx_v_token); + __pyx_t_4 = __pyx_v_token; } else { __Pyx_INCREF(Py_None); __pyx_t_4 = Py_None; @@ -2519,57 +2518,60 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s /* "fse/models/average_inner.pyx":133 * for token in obj[0]: - * word = vocab[token] if token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None * if word is None: # <<<<<<<<<<<<<< * continue - * c.word_indices[eff_words] = word.index + * c.word_indices[eff_words] = wv.key_to_index[token] */ __pyx_t_6 = (__pyx_v_word == Py_None); __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { /* "fse/models/average_inner.pyx":134 - * word = vocab[token] if token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None * if word is None: * continue # <<<<<<<<<<<<<< - * c.word_indices[eff_words] = word.index + * c.word_indices[eff_words] = wv.key_to_index[token] * c.sent_adresses[eff_words] = obj[1] */ goto __pyx_L6_continue; /* "fse/models/average_inner.pyx":133 * for token in obj[0]: - * word = vocab[token] if 
token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None * if word is None: # <<<<<<<<<<<<<< * continue - * c.word_indices[eff_words] = word.index + * c.word_indices[eff_words] = wv.key_to_index[token] */ } /* "fse/models/average_inner.pyx":135 * if word is None: * continue - * c.word_indices[eff_words] = word.index # <<<<<<<<<<<<<< + * c.word_indices[eff_words] = wv.key_to_index[token] # <<<<<<<<<<<<<< * c.sent_adresses[eff_words] = obj[1] * */ - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_index); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 135, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_wv, __pyx_n_s_key_to_index); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 135, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_11 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_11 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 135, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyObject_GetItem(__pyx_t_4, __pyx_v_token); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 135, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_11 = __Pyx_PyInt_As_npy_uint32(__pyx_t_10); if (unlikely((__pyx_t_11 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 135, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; (__pyx_v_c->word_indices[__pyx_v_eff_words]) = ((__pyx_t_3fse_6models_13average_inner_uINT_t)__pyx_t_11); /* "fse/models/average_inner.pyx":136 * continue - * c.word_indices[eff_words] = word.index + * c.word_indices[eff_words] = wv.key_to_index[token] * c.sent_adresses[eff_words] = obj[1] # <<<<<<<<<<<<<< * * eff_words += ONE */ - __pyx_t_4 = __Pyx_GetItemInt(__pyx_v_obj, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 136, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_11 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_11 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 136, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_10 = __Pyx_GetItemInt(__pyx_v_obj, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 136, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); + __pyx_t_11 = __Pyx_PyInt_As_npy_uint32(__pyx_t_10); if (unlikely((__pyx_t_11 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 136, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; (__pyx_v_c->sent_adresses[__pyx_v_eff_words]) = ((__pyx_t_3fse_6models_13average_inner_uINT_t)__pyx_t_11); /* "fse/models/average_inner.pyx":138 @@ -2613,7 +2615,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s * if not obj[0]: * continue * for token in obj[0]: # <<<<<<<<<<<<<< - * word = vocab[token] if token in vocab else None # Vocab obj + * word = token if token in wv.key_to_index else None * if word is None: */ __pyx_L6_continue:; @@ -2684,29 +2686,29 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s * * return eff_sents, eff_words # <<<<<<<<<<<<<< * - * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sentences): + * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, wv, indexed_sentences): */ __Pyx_XDECREF(__pyx_r); __pyx_t_1 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_sents); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 147, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_7 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 147, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_7); - __pyx_t_4 = 
PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 147, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); + __pyx_t_10 = PyTuple_New(2); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 147, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_10); __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_7); - PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_10, 1, __pyx_t_7); __pyx_t_1 = 0; __pyx_t_7 = 0; - __pyx_r = __pyx_t_4; - __pyx_t_4 = 0; + __pyx_r = __pyx_t_10; + __pyx_t_10 = 0; goto __pyx_L0; /* "fse/models/average_inner.pyx":99 * c[0].sentence_vectors = (np.PyArray_DATA(target)) * - * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_sentences): # <<<<<<<<<<<<<< + * cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, wv, indexed_sentences): # <<<<<<<<<<<<<< * """Prepare C structures for BaseAny2VecModel so we can go "full C" and release the Python GIL. * */ @@ -2731,17 +2733,16 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_base_s2v_config(s /* "fse/models/average_inner.pyx":149 * return eff_sents, eff_words * - * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sentences): # <<<<<<<<<<<<<< + * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, wv, indexed_sentences): # <<<<<<<<<<<<<< * """Prepare C structures for FastText so we can go "full C" and release the Python GIL. * */ -static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(struct __pyx_t_3fse_6models_13average_inner_FTSentenceVecsConfig *__pyx_v_c, PyObject *__pyx_v_vocab, PyObject *__pyx_v_indexed_sentences) { +static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(struct __pyx_t_3fse_6models_13average_inner_FTSentenceVecsConfig *__pyx_v_c, PyObject *__pyx_v_wv, PyObject *__pyx_v_indexed_sentences) { __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_v_eff_words; __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_v_eff_sents; PyObject *__pyx_v_obj = NULL; PyObject *__pyx_v_token = NULL; - PyObject *__pyx_v_word = NULL; PyObject *__pyx_v_encoded_ngrams = NULL; PyObject *__pyx_v_hashes = NULL; PyObject *__pyx_v_i = NULL; @@ -2884,7 +2885,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * continue * for token in obj[0]: # <<<<<<<<<<<<<< * c.sent_adresses[eff_words] = obj[1] - * if token in vocab: + * if token in wv.key_to_index: */ __pyx_t_4 = __Pyx_GetItemInt(__pyx_v_obj, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 181, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); @@ -2935,7 +2936,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * continue * for token in obj[0]: * c.sent_adresses[eff_words] = obj[1] # <<<<<<<<<<<<<< - * if token in vocab: + * if token in wv.key_to_index: * # In Vocabulary */ __pyx_t_4 = __Pyx_GetItemInt(__pyx_v_obj, 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 182, __pyx_L1_error) @@ -2947,42 +2948,36 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str /* "fse/models/average_inner.pyx":183 * for token in obj[0]: * c.sent_adresses[eff_words] = obj[1] - * if token in vocab: # <<<<<<<<<<<<<< + * if token in wv.key_to_index: # <<<<<<<<<<<<<< * # In Vocabulary - * word = vocab[token] + * c.word_indices[eff_words] = wv.key_to_index[token] */ - __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_v_vocab, Py_EQ)); 
if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 183, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_wv, __pyx_n_s_key_to_index); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_6 = (__Pyx_PySequence_ContainsTF(__pyx_v_token, __pyx_t_4, Py_EQ)); if (unlikely(__pyx_t_6 < 0)) __PYX_ERR(0, 183, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_5 = (__pyx_t_6 != 0); if (__pyx_t_5) { /* "fse/models/average_inner.pyx":185 - * if token in vocab: - * # In Vocabulary - * word = vocab[token] # <<<<<<<<<<<<<< - * c.word_indices[eff_words] = word.index - * c.subwords_idx_len[eff_words] = ZERO - */ - __pyx_t_4 = __Pyx_PyObject_GetItem(__pyx_v_vocab, __pyx_v_token); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 185, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __Pyx_XDECREF_SET(__pyx_v_word, __pyx_t_4); - __pyx_t_4 = 0; - - /* "fse/models/average_inner.pyx":186 + * if token in wv.key_to_index: * # In Vocabulary - * word = vocab[token] - * c.word_indices[eff_words] = word.index # <<<<<<<<<<<<<< + * c.word_indices[eff_words] = wv.key_to_index[token] # <<<<<<<<<<<<<< * c.subwords_idx_len[eff_words] = ZERO * else: */ - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_index); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 186, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_wv, __pyx_n_s_key_to_index); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 185, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_10 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_10 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 186, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyObject_GetItem(__pyx_t_4, __pyx_v_token); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_10 = __Pyx_PyInt_As_npy_uint32(__pyx_t_11); if (unlikely((__pyx_t_10 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 185, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; (__pyx_v_c->word_indices[__pyx_v_eff_words]) = ((__pyx_t_3fse_6models_13average_inner_uINT_t)__pyx_t_10); - /* "fse/models/average_inner.pyx":187 - * word = vocab[token] - * c.word_indices[eff_words] = word.index + /* "fse/models/average_inner.pyx":186 + * # In Vocabulary + * c.word_indices[eff_words] = wv.key_to_index[token] * c.subwords_idx_len[eff_words] = ZERO # <<<<<<<<<<<<<< * else: * # OOV words --> write ngram indices to memory @@ -2992,14 +2987,14 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str /* "fse/models/average_inner.pyx":183 * for token in obj[0]: * c.sent_adresses[eff_words] = obj[1] - * if token in vocab: # <<<<<<<<<<<<<< + * if token in wv.key_to_index: # <<<<<<<<<<<<<< * # In Vocabulary - * word = vocab[token] + * c.word_indices[eff_words] = wv.key_to_index[token] */ goto __pyx_L8; } - /* "fse/models/average_inner.pyx":190 + /* "fse/models/average_inner.pyx":189 * else: * # OOV words --> write ngram indices to memory * c.word_indices[eff_words] = ZERO # <<<<<<<<<<<<<< @@ -3009,53 +3004,53 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str /*else*/ { (__pyx_v_c->word_indices[__pyx_v_eff_words]) = __pyx_v_3fse_6models_13average_inner_ZERO; - /* "fse/models/average_inner.pyx":192 + /* "fse/models/average_inner.pyx":191 * c.word_indices[eff_words] = ZERO * * encoded_ngrams = compute_ngrams_bytes(token, c.min_n, c.max_n) # <<<<<<<<<<<<<< * hashes = [ft_hash_bytes(n) % c.bucket for n in encoded_ngrams] * */ - 
__Pyx_GetModuleGlobalName(__pyx_t_11, __pyx_n_s_compute_ngrams_bytes); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 192, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_12 = __Pyx_PyInt_From_int(__pyx_v_c->min_n); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 192, __pyx_L1_error) + __Pyx_GetModuleGlobalName(__pyx_t_4, __pyx_n_s_compute_ngrams_bytes); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_12 = __Pyx_PyInt_From_int(__pyx_v_c->min_n); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); - __pyx_t_13 = __Pyx_PyInt_From_int(__pyx_v_c->max_n); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 192, __pyx_L1_error) + __pyx_t_13 = __Pyx_PyInt_From_int(__pyx_v_c->max_n); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); __pyx_t_14 = NULL; __pyx_t_15 = 0; - if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_11))) { - __pyx_t_14 = PyMethod_GET_SELF(__pyx_t_11); + if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) { + __pyx_t_14 = PyMethod_GET_SELF(__pyx_t_4); if (likely(__pyx_t_14)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_11); + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4); __Pyx_INCREF(__pyx_t_14); __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_11, function); + __Pyx_DECREF_SET(__pyx_t_4, function); __pyx_t_15 = 1; } } #if CYTHON_FAST_PYCALL - if (PyFunction_Check(__pyx_t_11)) { + if (PyFunction_Check(__pyx_t_4)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_v_token, __pyx_t_12, __pyx_t_13}; - __pyx_t_4 = __Pyx_PyFunction_FastCall(__pyx_t_11, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 192, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; - __Pyx_GOTREF(__pyx_t_4); + __Pyx_GOTREF(__pyx_t_11); __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } else #endif #if CYTHON_FAST_PYCCALL - if (__Pyx_PyFastCFunction_Check(__pyx_t_11)) { + if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) { PyObject *__pyx_temp[4] = {__pyx_t_14, __pyx_v_token, __pyx_t_12, __pyx_t_13}; - __pyx_t_4 = __Pyx_PyCFunction_FastCall(__pyx_t_11, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 192, __pyx_L1_error) + __pyx_t_11 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-__pyx_t_15, 3+__pyx_t_15); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_XDECREF(__pyx_t_14); __pyx_t_14 = 0; - __Pyx_GOTREF(__pyx_t_4); + __Pyx_GOTREF(__pyx_t_11); __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; } else #endif { - __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 192, __pyx_L1_error) + __pyx_t_16 = PyTuple_New(3+__pyx_t_15); if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); if (__pyx_t_14) { __Pyx_GIVEREF(__pyx_t_14); PyTuple_SET_ITEM(__pyx_t_16, 0, __pyx_t_14); __pyx_t_14 = NULL; @@ -3069,57 +3064,57 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str PyTuple_SET_ITEM(__pyx_t_16, 2+__pyx_t_15, __pyx_t_13); __pyx_t_12 = 0; __pyx_t_13 = 0; - __pyx_t_4 = __Pyx_PyObject_Call(__pyx_t_11, __pyx_t_16, NULL); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 192, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); + __pyx_t_11 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_16, NULL); if 
(unlikely(!__pyx_t_11)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); __Pyx_DECREF(__pyx_t_16); __pyx_t_16 = 0; } - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __Pyx_XDECREF_SET(__pyx_v_encoded_ngrams, __pyx_t_4); - __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_XDECREF_SET(__pyx_v_encoded_ngrams, __pyx_t_11); + __pyx_t_11 = 0; - /* "fse/models/average_inner.pyx":193 + /* "fse/models/average_inner.pyx":192 * * encoded_ngrams = compute_ngrams_bytes(token, c.min_n, c.max_n) * hashes = [ft_hash_bytes(n) % c.bucket for n in encoded_ngrams] # <<<<<<<<<<<<<< * * c.subwords_idx_len[eff_words] = min(len(encoded_ngrams), MAX_NGRAMS) */ - __pyx_t_4 = PyList_New(0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 193, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); + __pyx_t_11 = PyList_New(0); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 192, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); if (likely(PyList_CheckExact(__pyx_v_encoded_ngrams)) || PyTuple_CheckExact(__pyx_v_encoded_ngrams)) { - __pyx_t_11 = __pyx_v_encoded_ngrams; __Pyx_INCREF(__pyx_t_11); __pyx_t_17 = 0; + __pyx_t_4 = __pyx_v_encoded_ngrams; __Pyx_INCREF(__pyx_t_4); __pyx_t_17 = 0; __pyx_t_18 = NULL; } else { - __pyx_t_17 = -1; __pyx_t_11 = PyObject_GetIter(__pyx_v_encoded_ngrams); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 193, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_18 = Py_TYPE(__pyx_t_11)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_17 = -1; __pyx_t_4 = PyObject_GetIter(__pyx_v_encoded_ngrams); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 192, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_18 = Py_TYPE(__pyx_t_4)->tp_iternext; if (unlikely(!__pyx_t_18)) __PYX_ERR(0, 192, __pyx_L1_error) } for (;;) { if (likely(!__pyx_t_18)) { - if (likely(PyList_CheckExact(__pyx_t_11))) { - if (__pyx_t_17 >= PyList_GET_SIZE(__pyx_t_11)) break; + if (likely(PyList_CheckExact(__pyx_t_4))) { + if (__pyx_t_17 >= PyList_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_16 = PyList_GET_ITEM(__pyx_t_11, __pyx_t_17); __Pyx_INCREF(__pyx_t_16); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_16 = PyList_GET_ITEM(__pyx_t_4, __pyx_t_17); __Pyx_INCREF(__pyx_t_16); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 192, __pyx_L1_error) #else - __pyx_t_16 = PySequence_ITEM(__pyx_t_11, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_16 = PySequence_ITEM(__pyx_t_4, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); #endif } else { - if (__pyx_t_17 >= PyTuple_GET_SIZE(__pyx_t_11)) break; + if (__pyx_t_17 >= PyTuple_GET_SIZE(__pyx_t_4)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_16 = PyTuple_GET_ITEM(__pyx_t_11, __pyx_t_17); __Pyx_INCREF(__pyx_t_16); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_16 = PyTuple_GET_ITEM(__pyx_t_4, __pyx_t_17); __Pyx_INCREF(__pyx_t_16); __pyx_t_17++; if (unlikely(0 < 0)) __PYX_ERR(0, 192, __pyx_L1_error) #else - __pyx_t_16 = PySequence_ITEM(__pyx_t_11, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_16 = PySequence_ITEM(__pyx_t_4, __pyx_t_17); __pyx_t_17++; if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); #endif } } else { - __pyx_t_16 = __pyx_t_18(__pyx_t_11); + __pyx_t_16 = __pyx_t_18(__pyx_t_4); if (unlikely(!__pyx_t_16)) { 
PyObject* exc_type = PyErr_Occurred(); if (exc_type) { if (likely(__Pyx_PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); - else __PYX_ERR(0, 193, __pyx_L1_error) + else __PYX_ERR(0, 192, __pyx_L1_error) } break; } @@ -3127,7 +3122,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str } __Pyx_XDECREF_SET(__pyx_v_n, __pyx_t_16); __pyx_t_16 = 0; - __Pyx_GetModuleGlobalName(__pyx_t_13, __pyx_n_s_ft_hash_bytes); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 193, __pyx_L1_error) + __Pyx_GetModuleGlobalName(__pyx_t_13, __pyx_n_s_ft_hash_bytes); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); __pyx_t_12 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_13))) { @@ -3141,23 +3136,23 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str } __pyx_t_16 = (__pyx_t_12) ? __Pyx_PyObject_Call2Args(__pyx_t_13, __pyx_t_12, __pyx_v_n) : __Pyx_PyObject_CallOneArg(__pyx_t_13, __pyx_v_n); __Pyx_XDECREF(__pyx_t_12); __pyx_t_12 = 0; - if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 193, __pyx_L1_error) + if (unlikely(!__pyx_t_16)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_16); __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - __pyx_t_13 = __Pyx_PyInt_From_int(__pyx_v_c->bucket); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_13 = __Pyx_PyInt_From_int(__pyx_v_c->bucket); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); - __pyx_t_12 = PyNumber_Remainder(__pyx_t_16, __pyx_t_13); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 193, __pyx_L1_error) + __pyx_t_12 = PyNumber_Remainder(__pyx_t_16, __pyx_t_13); if (unlikely(!__pyx_t_12)) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_12); __Pyx_DECREF(__pyx_t_16); __pyx_t_16 = 0; __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; - if (unlikely(__Pyx_ListComp_Append(__pyx_t_4, (PyObject*)__pyx_t_12))) __PYX_ERR(0, 193, __pyx_L1_error) + if (unlikely(__Pyx_ListComp_Append(__pyx_t_11, (PyObject*)__pyx_t_12))) __PYX_ERR(0, 192, __pyx_L1_error) __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; } - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __Pyx_XDECREF_SET(__pyx_v_hashes, ((PyObject*)__pyx_t_4)); - __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_XDECREF_SET(__pyx_v_hashes, ((PyObject*)__pyx_t_11)); + __pyx_t_11 = 0; - /* "fse/models/average_inner.pyx":195 + /* "fse/models/average_inner.pyx":194 * hashes = [ft_hash_bytes(n) % c.bucket for n in encoded_ngrams] * * c.subwords_idx_len[eff_words] = min(len(encoded_ngrams), MAX_NGRAMS) # <<<<<<<<<<<<<< @@ -3165,7 +3160,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * c.subwords_idx[(eff_words * MAX_NGRAMS) + i] = h */ __pyx_t_19 = 40; - __pyx_t_17 = PyObject_Length(__pyx_v_encoded_ngrams); if (unlikely(__pyx_t_17 == ((Py_ssize_t)-1))) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_t_17 = PyObject_Length(__pyx_v_encoded_ngrams); if (unlikely(__pyx_t_17 == ((Py_ssize_t)-1))) __PYX_ERR(0, 194, __pyx_L1_error) if (((__pyx_t_19 < __pyx_t_17) != 0)) { __pyx_t_20 = __pyx_t_19; } else { @@ -3173,7 +3168,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str } (__pyx_v_c->subwords_idx_len[__pyx_v_eff_words]) = ((__pyx_t_3fse_6models_13average_inner_uINT_t)__pyx_t_20); - /* "fse/models/average_inner.pyx":196 + /* "fse/models/average_inner.pyx":195 * * c.subwords_idx_len[eff_words] = min(len(encoded_ngrams), MAX_NGRAMS) * for i, h in enumerate(hashes[:MAX_NGRAMS]): # <<<<<<<<<<<<<< 
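The generated C in this region mirrors the FastText OOV path of `average_inner.pyx`: a token missing from `key_to_index` is represented by at most MAX_NGRAMS (40) bucketed ngram vectors, which are averaged and scaled by `oov_weight`. A rough Python equivalent under gensim 4.x (a sketch with toy names, not part of the patch):

```
# Sketch only: what populate_ft_s2v_config / compute_ft_sentence_averages
# compute for a single OOV token, using gensim 4.x helpers.
import numpy as np
from gensim.models import FastText
from gensim.models.fasttext import ft_ngram_hashes

ft = FastText(
    [["cat", "say", "meow"], ["dog", "say", "woof"]],
    min_count=1, vector_size=10,
)

token = "doggo"  # not in the toy vocabulary
# compute_ngrams_bytes + ft_hash_bytes % bucket, as in the pyx code above
hashes = ft_ngram_hashes(token, ft.wv.min_n, ft.wv.max_n, ft.wv.bucket)[:40]
oov_vector = np.mean(ft.wv.vectors_ngrams[hashes], axis=0)  # fse then applies oov_weight
```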
@@ -3181,47 +3176,47 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * */ __Pyx_INCREF(__pyx_int_0); - __pyx_t_4 = __pyx_int_0; - __pyx_t_11 = __Pyx_PyList_GetSlice(__pyx_v_hashes, 0, 40); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 196, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_12 = __pyx_t_11; __Pyx_INCREF(__pyx_t_12); __pyx_t_20 = 0; - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; + __pyx_t_11 = __pyx_int_0; + __pyx_t_4 = __Pyx_PyList_GetSlice(__pyx_v_hashes, 0, 40); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_12 = __pyx_t_4; __Pyx_INCREF(__pyx_t_12); __pyx_t_20 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; for (;;) { if (__pyx_t_20 >= PyList_GET_SIZE(__pyx_t_12)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_11 = PyList_GET_ITEM(__pyx_t_12, __pyx_t_20); __Pyx_INCREF(__pyx_t_11); __pyx_t_20++; if (unlikely(0 < 0)) __PYX_ERR(0, 196, __pyx_L1_error) + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_12, __pyx_t_20); __Pyx_INCREF(__pyx_t_4); __pyx_t_20++; if (unlikely(0 < 0)) __PYX_ERR(0, 195, __pyx_L1_error) #else - __pyx_t_11 = PySequence_ITEM(__pyx_t_12, __pyx_t_20); __pyx_t_20++; if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 196, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); + __pyx_t_4 = PySequence_ITEM(__pyx_t_12, __pyx_t_20); __pyx_t_20++; if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); #endif - __Pyx_XDECREF_SET(__pyx_v_h, __pyx_t_11); - __pyx_t_11 = 0; - __Pyx_INCREF(__pyx_t_4); - __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_4); - __pyx_t_11 = __Pyx_PyInt_AddObjC(__pyx_t_4, __pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 196, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __Pyx_DECREF(__pyx_t_4); - __pyx_t_4 = __pyx_t_11; - __pyx_t_11 = 0; + __Pyx_XDECREF_SET(__pyx_v_h, __pyx_t_4); + __pyx_t_4 = 0; + __Pyx_INCREF(__pyx_t_11); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_11); + __pyx_t_4 = __Pyx_PyInt_AddObjC(__pyx_t_11, __pyx_int_1, 1, 0, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 195, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_11); + __pyx_t_11 = __pyx_t_4; + __pyx_t_4 = 0; - /* "fse/models/average_inner.pyx":197 + /* "fse/models/average_inner.pyx":196 * c.subwords_idx_len[eff_words] = min(len(encoded_ngrams), MAX_NGRAMS) * for i, h in enumerate(hashes[:MAX_NGRAMS]): * c.subwords_idx[(eff_words * MAX_NGRAMS) + i] = h # <<<<<<<<<<<<<< * * eff_words += ONE */ - __pyx_t_10 = __Pyx_PyInt_As_npy_uint32(__pyx_v_h); if (unlikely((__pyx_t_10 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 197, __pyx_L1_error) - __pyx_t_11 = __Pyx_PyInt_From_long((__pyx_v_eff_words * 40)); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 197, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_11); - __pyx_t_13 = PyNumber_Add(__pyx_t_11, __pyx_v_i); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_10 = __Pyx_PyInt_As_npy_uint32(__pyx_v_h); if (unlikely((__pyx_t_10 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyInt_From_long((__pyx_v_eff_words * 40)); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 196, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_13 = PyNumber_Add(__pyx_t_4, __pyx_v_i); if (unlikely(!__pyx_t_13)) __PYX_ERR(0, 196, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_13); - __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; - __pyx_t_17 = __Pyx_PyIndex_AsSsize_t(__pyx_t_13); if (unlikely((__pyx_t_17 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 197, __pyx_L1_error) + 
__Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_17 = __Pyx_PyIndex_AsSsize_t(__pyx_t_13); if (unlikely((__pyx_t_17 == (Py_ssize_t)-1) && PyErr_Occurred())) __PYX_ERR(0, 196, __pyx_L1_error) __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0; (__pyx_v_c->subwords_idx[__pyx_t_17]) = ((__pyx_t_3fse_6models_13average_inner_uINT_t)__pyx_t_10); - /* "fse/models/average_inner.pyx":196 + /* "fse/models/average_inner.pyx":195 * * c.subwords_idx_len[eff_words] = min(len(encoded_ngrams), MAX_NGRAMS) * for i, h in enumerate(hashes[:MAX_NGRAMS]): # <<<<<<<<<<<<<< @@ -3230,11 +3225,11 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ } __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0; - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0; } __pyx_L8:; - /* "fse/models/average_inner.pyx":199 + /* "fse/models/average_inner.pyx":198 * c.subwords_idx[(eff_words * MAX_NGRAMS) + i] = h * * eff_words += ONE # <<<<<<<<<<<<<< @@ -3243,7 +3238,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ __pyx_v_eff_words = (__pyx_v_eff_words + __pyx_v_3fse_6models_13average_inner_ONE); - /* "fse/models/average_inner.pyx":201 + /* "fse/models/average_inner.pyx":200 * eff_words += ONE * * if eff_words == MAX_WORDS: # <<<<<<<<<<<<<< @@ -3253,7 +3248,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str __pyx_t_5 = ((__pyx_v_eff_words == 0x2710) != 0); if (__pyx_t_5) { - /* "fse/models/average_inner.pyx":202 + /* "fse/models/average_inner.pyx":201 * * if eff_words == MAX_WORDS: * break # <<<<<<<<<<<<<< @@ -3262,7 +3257,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ goto __pyx_L7_break; - /* "fse/models/average_inner.pyx":201 + /* "fse/models/average_inner.pyx":200 * eff_words += ONE * * if eff_words == MAX_WORDS: # <<<<<<<<<<<<<< @@ -3276,13 +3271,13 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * continue * for token in obj[0]: # <<<<<<<<<<<<<< * c.sent_adresses[eff_words] = obj[1] - * if token in vocab: + * if token in wv.key_to_index: */ } __pyx_L7_break:; __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; - /* "fse/models/average_inner.pyx":204 + /* "fse/models/average_inner.pyx":203 * break * * eff_sents += 1 # <<<<<<<<<<<<<< @@ -3291,7 +3286,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ __pyx_v_eff_sents = (__pyx_v_eff_sents + 1); - /* "fse/models/average_inner.pyx":205 + /* "fse/models/average_inner.pyx":204 * * eff_sents += 1 * c.sentence_boundary[eff_sents] = eff_words # <<<<<<<<<<<<<< @@ -3300,7 +3295,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ (__pyx_v_c->sentence_boundary[__pyx_v_eff_sents]) = __pyx_v_eff_words; - /* "fse/models/average_inner.pyx":207 + /* "fse/models/average_inner.pyx":206 * c.sentence_boundary[eff_sents] = eff_words * * if eff_words == MAX_WORDS: # <<<<<<<<<<<<<< @@ -3310,7 +3305,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str __pyx_t_5 = ((__pyx_v_eff_words == 0x2710) != 0); if (__pyx_t_5) { - /* "fse/models/average_inner.pyx":208 + /* "fse/models/average_inner.pyx":207 * * if eff_words == MAX_WORDS: * break # <<<<<<<<<<<<<< @@ -3319,7 +3314,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str */ goto __pyx_L4_break; - /* "fse/models/average_inner.pyx":207 + /* "fse/models/average_inner.pyx":206 * c.sentence_boundary[eff_sents] = eff_words * * if 
eff_words == MAX_WORDS: # <<<<<<<<<<<<<< @@ -3340,7 +3335,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str __pyx_L4_break:; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "fse/models/average_inner.pyx":210 + /* "fse/models/average_inner.pyx":209 * break * * return eff_sents, eff_words # <<<<<<<<<<<<<< @@ -3348,26 +3343,26 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str * cdef void compute_base_sentence_averages(BaseSentenceVecsConfig *c, uINT_t num_sentences) nogil: */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_sents); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 210, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_sents); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 209, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_7 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 210, __pyx_L1_error) + __pyx_t_7 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_words); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 209, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_7); - __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 210, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); + __pyx_t_11 = PyTuple_New(2); if (unlikely(!__pyx_t_11)) __PYX_ERR(0, 209, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_11); __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_11, 0, __pyx_t_1); __Pyx_GIVEREF(__pyx_t_7); - PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_11, 1, __pyx_t_7); __pyx_t_1 = 0; __pyx_t_7 = 0; - __pyx_r = __pyx_t_4; - __pyx_t_4 = 0; + __pyx_r = __pyx_t_11; + __pyx_t_11 = 0; goto __pyx_L0; /* "fse/models/average_inner.pyx":149 * return eff_sents, eff_words * - * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sentences): # <<<<<<<<<<<<<< + * cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, wv, indexed_sentences): # <<<<<<<<<<<<<< * """Prepare C structures for FastText so we can go "full C" and release the Python GIL. 
* */ @@ -3387,7 +3382,6 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str __pyx_L0:; __Pyx_XDECREF(__pyx_v_obj); __Pyx_XDECREF(__pyx_v_token); - __Pyx_XDECREF(__pyx_v_word); __Pyx_XDECREF(__pyx_v_encoded_ngrams); __Pyx_XDECREF(__pyx_v_hashes); __Pyx_XDECREF(__pyx_v_i); @@ -3398,7 +3392,7 @@ static PyObject *__pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config(str return __pyx_r; } -/* "fse/models/average_inner.pyx":212 +/* "fse/models/average_inner.pyx":211 * return eff_sents, eff_words * * cdef void compute_base_sentence_averages(BaseSentenceVecsConfig *c, uINT_t num_sentences) nogil: # <<<<<<<<<<<<<< @@ -3426,7 +3420,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_t_7; int __pyx_t_8; - /* "fse/models/average_inner.pyx":228 + /* "fse/models/average_inner.pyx":227 * """ * cdef: * int size = c.size # <<<<<<<<<<<<<< @@ -3436,7 +3430,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( __pyx_t_1 = __pyx_v_c->size; __pyx_v_size = __pyx_t_1; - /* "fse/models/average_inner.pyx":236 + /* "fse/models/average_inner.pyx":235 * REAL_t sent_len, inv_count * * for sent_idx in range(num_sentences): # <<<<<<<<<<<<<< @@ -3448,7 +3442,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) { __pyx_v_sent_idx = __pyx_t_4; - /* "fse/models/average_inner.pyx":237 + /* "fse/models/average_inner.pyx":236 * * for sent_idx in range(num_sentences): * memset(c.mem, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<< @@ -3457,7 +3451,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ (void)(memset(__pyx_v_c->mem, 0, (__pyx_v_size * (sizeof(__pyx_t_3fse_6models_13average_inner_REAL_t))))); - /* "fse/models/average_inner.pyx":239 + /* "fse/models/average_inner.pyx":238 * memset(c.mem, 0, size * cython.sizeof(REAL_t)) * * sent_start = c.sentence_boundary[sent_idx] # <<<<<<<<<<<<<< @@ -3466,7 +3460,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_sent_start = (__pyx_v_c->sentence_boundary[__pyx_v_sent_idx]); - /* "fse/models/average_inner.pyx":240 + /* "fse/models/average_inner.pyx":239 * * sent_start = c.sentence_boundary[sent_idx] * sent_end = c.sentence_boundary[sent_idx + 1] # <<<<<<<<<<<<<< @@ -3475,7 +3469,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_sent_end = (__pyx_v_c->sentence_boundary[(__pyx_v_sent_idx + 1)]); - /* "fse/models/average_inner.pyx":241 + /* "fse/models/average_inner.pyx":240 * sent_start = c.sentence_boundary[sent_idx] * sent_end = c.sentence_boundary[sent_idx + 1] * sent_len = ZEROF # <<<<<<<<<<<<<< @@ -3484,7 +3478,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_sent_len = __pyx_v_3fse_6models_13average_inner_ZEROF; - /* "fse/models/average_inner.pyx":243 + /* "fse/models/average_inner.pyx":242 * sent_len = ZEROF * * for i in range(sent_start, sent_end): # <<<<<<<<<<<<<< @@ -3496,7 +3490,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( for (__pyx_t_7 = __pyx_v_sent_start; __pyx_t_7 < __pyx_t_6; __pyx_t_7+=1) { __pyx_v_i = __pyx_t_7; - /* "fse/models/average_inner.pyx":244 + /* "fse/models/average_inner.pyx":243 * * for i in range(sent_start, sent_end): * sent_len += ONEF # <<<<<<<<<<<<<< @@ -3505,7 +3499,7 @@ static void 
__pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_sent_len = (__pyx_v_sent_len + __pyx_v_3fse_6models_13average_inner_ONEF); - /* "fse/models/average_inner.pyx":245 + /* "fse/models/average_inner.pyx":244 * for i in range(sent_start, sent_end): * sent_len += ONEF * sent_row = c.sent_adresses[i] * size # <<<<<<<<<<<<<< @@ -3514,7 +3508,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_sent_row = ((__pyx_v_c->sent_adresses[__pyx_v_i]) * __pyx_v_size); - /* "fse/models/average_inner.pyx":246 + /* "fse/models/average_inner.pyx":245 * sent_len += ONEF * sent_row = c.sent_adresses[i] * size * word_row = c.word_indices[i] * size # <<<<<<<<<<<<<< @@ -3523,7 +3517,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_word_row = ((__pyx_v_c->word_indices[__pyx_v_i]) * __pyx_v_size); - /* "fse/models/average_inner.pyx":247 + /* "fse/models/average_inner.pyx":246 * sent_row = c.sent_adresses[i] * size * word_row = c.word_indices[i] * size * word_idx = c.word_indices[i] # <<<<<<<<<<<<<< @@ -3532,7 +3526,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_word_idx = (__pyx_v_c->word_indices[__pyx_v_i]); - /* "fse/models/average_inner.pyx":249 + /* "fse/models/average_inner.pyx":248 * word_idx = c.word_indices[i] * * saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, c.mem, &ONE) # <<<<<<<<<<<<<< @@ -3542,7 +3536,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( __pyx_v_3fse_6models_13average_inner_saxpy((&__pyx_v_size), (&(__pyx_v_c->word_weights[__pyx_v_word_idx])), (&(__pyx_v_c->word_vectors[__pyx_v_word_row])), (&__pyx_v_3fse_6models_13average_inner_ONE), __pyx_v_c->mem, (&__pyx_v_3fse_6models_13average_inner_ONE)); } - /* "fse/models/average_inner.pyx":251 + /* "fse/models/average_inner.pyx":250 * saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, c.mem, &ONE) * * if sent_len > ZEROF: # <<<<<<<<<<<<<< @@ -3552,7 +3546,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( __pyx_t_8 = ((__pyx_v_sent_len > __pyx_v_3fse_6models_13average_inner_ZEROF) != 0); if (__pyx_t_8) { - /* "fse/models/average_inner.pyx":252 + /* "fse/models/average_inner.pyx":251 * * if sent_len > ZEROF: * inv_count = ONEF / sent_len # <<<<<<<<<<<<<< @@ -3561,7 +3555,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_inv_count = (__pyx_v_3fse_6models_13average_inner_ONEF / __pyx_v_sent_len); - /* "fse/models/average_inner.pyx":255 + /* "fse/models/average_inner.pyx":254 * # If we perform the a*x on memory, the computation is compatible with many-to-one mappings * # because it doesn't rescale the overall result * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) # <<<<<<<<<<<<<< @@ -3570,7 +3564,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( */ __pyx_v_3fse_6models_13average_inner_saxpy((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_c->mem, (&__pyx_v_3fse_6models_13average_inner_ONE), (&(__pyx_v_c->sentence_vectors[__pyx_v_sent_row])), (&__pyx_v_3fse_6models_13average_inner_ONE)); - /* "fse/models/average_inner.pyx":251 + /* "fse/models/average_inner.pyx":250 * saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, c.mem, &ONE) * * if sent_len > ZEROF: # <<<<<<<<<<<<<< @@ -3580,7 +3574,7 @@ static void 
__pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( } } - /* "fse/models/average_inner.pyx":212 + /* "fse/models/average_inner.pyx":211 * return eff_sents, eff_words * * cdef void compute_base_sentence_averages(BaseSentenceVecsConfig *c, uINT_t num_sentences) nogil: # <<<<<<<<<<<<<< @@ -3591,7 +3585,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages( /* function exit code */ } -/* "fse/models/average_inner.pyx":257 +/* "fse/models/average_inner.pyx":256 * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) * * cdef void compute_ft_sentence_averages(FTSentenceVecsConfig *c, uINT_t num_sentences) nogil: # <<<<<<<<<<<<<< @@ -3628,7 +3622,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_t_11; __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_t_12; - /* "fse/models/average_inner.pyx":273 + /* "fse/models/average_inner.pyx":272 * """ * cdef: * int size = c.size # <<<<<<<<<<<<<< @@ -3638,7 +3632,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_t_1 = __pyx_v_c->size; __pyx_v_size = __pyx_t_1; - /* "fse/models/average_inner.pyx":283 + /* "fse/models/average_inner.pyx":282 * REAL_t sent_len * REAL_t inv_count, inv_ngram * REAL_t oov_weight = c.oov_weight # <<<<<<<<<<<<<< @@ -3648,7 +3642,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_t_2 = __pyx_v_c->oov_weight; __pyx_v_oov_weight = __pyx_t_2; - /* "fse/models/average_inner.pyx":286 + /* "fse/models/average_inner.pyx":285 * * * for sent_idx in range(num_sentences): # <<<<<<<<<<<<<< @@ -3660,7 +3654,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { __pyx_v_sent_idx = __pyx_t_5; - /* "fse/models/average_inner.pyx":287 + /* "fse/models/average_inner.pyx":286 * * for sent_idx in range(num_sentences): * memset(c.mem, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<< @@ -3669,7 +3663,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ (void)(memset(__pyx_v_c->mem, 0, (__pyx_v_size * (sizeof(__pyx_t_3fse_6models_13average_inner_REAL_t))))); - /* "fse/models/average_inner.pyx":288 + /* "fse/models/average_inner.pyx":287 * for sent_idx in range(num_sentences): * memset(c.mem, 0, size * cython.sizeof(REAL_t)) * sent_start = c.sentence_boundary[sent_idx] # <<<<<<<<<<<<<< @@ -3678,7 +3672,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_sent_start = (__pyx_v_c->sentence_boundary[__pyx_v_sent_idx]); - /* "fse/models/average_inner.pyx":289 + /* "fse/models/average_inner.pyx":288 * memset(c.mem, 0, size * cython.sizeof(REAL_t)) * sent_start = c.sentence_boundary[sent_idx] * sent_end = c.sentence_boundary[sent_idx + 1] # <<<<<<<<<<<<<< @@ -3687,7 +3681,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_sent_end = (__pyx_v_c->sentence_boundary[(__pyx_v_sent_idx + 1)]); - /* "fse/models/average_inner.pyx":290 + /* "fse/models/average_inner.pyx":289 * sent_start = c.sentence_boundary[sent_idx] * sent_end = c.sentence_boundary[sent_idx + 1] * sent_len = ZEROF # <<<<<<<<<<<<<< @@ -3696,7 +3690,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_sent_len = __pyx_v_3fse_6models_13average_inner_ZEROF; - /* "fse/models/average_inner.pyx":292 + /* 
"fse/models/average_inner.pyx":291 * sent_len = ZEROF * * for i in range(sent_start, sent_end): # <<<<<<<<<<<<<< @@ -3708,7 +3702,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st for (__pyx_t_8 = __pyx_v_sent_start; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) { __pyx_v_i = __pyx_t_8; - /* "fse/models/average_inner.pyx":293 + /* "fse/models/average_inner.pyx":292 * * for i in range(sent_start, sent_end): * sent_len += ONEF # <<<<<<<<<<<<<< @@ -3717,7 +3711,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_sent_len = (__pyx_v_sent_len + __pyx_v_3fse_6models_13average_inner_ONEF); - /* "fse/models/average_inner.pyx":294 + /* "fse/models/average_inner.pyx":293 * for i in range(sent_start, sent_end): * sent_len += ONEF * sent_row = c.sent_adresses[i] * size # <<<<<<<<<<<<<< @@ -3726,7 +3720,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_sent_row = ((__pyx_v_c->sent_adresses[__pyx_v_i]) * __pyx_v_size); - /* "fse/models/average_inner.pyx":296 + /* "fse/models/average_inner.pyx":295 * sent_row = c.sent_adresses[i] * size * * word_idx = c.word_indices[i] # <<<<<<<<<<<<<< @@ -3735,7 +3729,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_word_idx = (__pyx_v_c->word_indices[__pyx_v_i]); - /* "fse/models/average_inner.pyx":297 + /* "fse/models/average_inner.pyx":296 * * word_idx = c.word_indices[i] * ngrams = c.subwords_idx_len[i] # <<<<<<<<<<<<<< @@ -3744,7 +3738,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_ngrams = (__pyx_v_c->subwords_idx_len[__pyx_v_i]); - /* "fse/models/average_inner.pyx":299 + /* "fse/models/average_inner.pyx":298 * ngrams = c.subwords_idx_len[i] * * if ngrams == 0: # <<<<<<<<<<<<<< @@ -3754,7 +3748,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_t_9 = ((__pyx_v_ngrams == 0) != 0); if (__pyx_t_9) { - /* "fse/models/average_inner.pyx":300 + /* "fse/models/average_inner.pyx":299 * * if ngrams == 0: * word_row = c.word_indices[i] * size # <<<<<<<<<<<<<< @@ -3763,7 +3757,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_word_row = ((__pyx_v_c->word_indices[__pyx_v_i]) * __pyx_v_size); - /* "fse/models/average_inner.pyx":301 + /* "fse/models/average_inner.pyx":300 * if ngrams == 0: * word_row = c.word_indices[i] * size * saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, c.mem, &ONE) # <<<<<<<<<<<<<< @@ -3772,7 +3766,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_3fse_6models_13average_inner_saxpy((&__pyx_v_size), (&(__pyx_v_c->word_weights[__pyx_v_word_idx])), (&(__pyx_v_c->word_vectors[__pyx_v_word_row])), (&__pyx_v_3fse_6models_13average_inner_ONE), __pyx_v_c->mem, (&__pyx_v_3fse_6models_13average_inner_ONE)); - /* "fse/models/average_inner.pyx":299 + /* "fse/models/average_inner.pyx":298 * ngrams = c.subwords_idx_len[i] * * if ngrams == 0: # <<<<<<<<<<<<<< @@ -3782,7 +3776,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st goto __pyx_L7; } - /* "fse/models/average_inner.pyx":303 + /* "fse/models/average_inner.pyx":302 * saxpy(&size, &c.word_weights[word_idx], &c.word_vectors[word_row], &ONE, c.mem, &ONE) * else: * inv_ngram = (ONEF / ngrams) * c.oov_weight # <<<<<<<<<<<<<< @@ -3792,7 +3786,7 @@ static void 
__pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st /*else*/ { __pyx_v_inv_ngram = ((__pyx_v_3fse_6models_13average_inner_ONEF / ((__pyx_t_3fse_6models_13average_inner_REAL_t)__pyx_v_ngrams)) * __pyx_v_c->oov_weight); - /* "fse/models/average_inner.pyx":304 + /* "fse/models/average_inner.pyx":303 * else: * inv_ngram = (ONEF / ngrams) * c.oov_weight * for j in range(ngrams): # <<<<<<<<<<<<<< @@ -3804,7 +3798,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st for (__pyx_t_12 = 0; __pyx_t_12 < __pyx_t_11; __pyx_t_12+=1) { __pyx_v_j = __pyx_t_12; - /* "fse/models/average_inner.pyx":305 + /* "fse/models/average_inner.pyx":304 * inv_ngram = (ONEF / ngrams) * c.oov_weight * for j in range(ngrams): * ngram_row = c.subwords_idx[(i * MAX_NGRAMS)+j] * size # <<<<<<<<<<<<<< @@ -3813,7 +3807,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_ngram_row = ((__pyx_v_c->subwords_idx[((__pyx_v_i * 40) + __pyx_v_j)]) * __pyx_v_size); - /* "fse/models/average_inner.pyx":306 + /* "fse/models/average_inner.pyx":305 * for j in range(ngrams): * ngram_row = c.subwords_idx[(i * MAX_NGRAMS)+j] * size * saxpy(&size, &inv_ngram, &c.ngram_vectors[ngram_row], &ONE, c.mem, &ONE) # <<<<<<<<<<<<<< @@ -3826,7 +3820,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_L7:; } - /* "fse/models/average_inner.pyx":308 + /* "fse/models/average_inner.pyx":307 * saxpy(&size, &inv_ngram, &c.ngram_vectors[ngram_row], &ONE, c.mem, &ONE) * * if sent_len > ZEROF: # <<<<<<<<<<<<<< @@ -3836,7 +3830,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st __pyx_t_9 = ((__pyx_v_sent_len > __pyx_v_3fse_6models_13average_inner_ZEROF) != 0); if (__pyx_t_9) { - /* "fse/models/average_inner.pyx":309 + /* "fse/models/average_inner.pyx":308 * * if sent_len > ZEROF: * inv_count = ONEF / sent_len # <<<<<<<<<<<<<< @@ -3845,7 +3839,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_inv_count = (__pyx_v_3fse_6models_13average_inner_ONEF / __pyx_v_sent_len); - /* "fse/models/average_inner.pyx":310 + /* "fse/models/average_inner.pyx":309 * if sent_len > ZEROF: * inv_count = ONEF / sent_len * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) # <<<<<<<<<<<<<< @@ -3854,7 +3848,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st */ __pyx_v_3fse_6models_13average_inner_saxpy((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_c->mem, (&__pyx_v_3fse_6models_13average_inner_ONE), (&(__pyx_v_c->sentence_vectors[__pyx_v_sent_row])), (&__pyx_v_3fse_6models_13average_inner_ONE)); - /* "fse/models/average_inner.pyx":308 + /* "fse/models/average_inner.pyx":307 * saxpy(&size, &inv_ngram, &c.ngram_vectors[ngram_row], &ONE, c.mem, &ONE) * * if sent_len > ZEROF: # <<<<<<<<<<<<<< @@ -3864,7 +3858,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st } } - /* "fse/models/average_inner.pyx":257 + /* "fse/models/average_inner.pyx":256 * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) * * cdef void compute_ft_sentence_averages(FTSentenceVecsConfig *c, uINT_t num_sentences) nogil: # <<<<<<<<<<<<<< @@ -3875,7 +3869,7 @@ static void __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages(st /* function exit code */ } -/* "fse/models/average_inner.pyx":312 +/* "fse/models/average_inner.pyx":311 * saxpy(&size, &inv_count, c.mem, &ONE, 
&c.sentence_vectors[sent_row], &ONE) * * def train_average_cy(model, indexed_sentences, target, memory): # <<<<<<<<<<<<<< @@ -3922,23 +3916,23 @@ static PyObject *__pyx_pw_3fse_6models_13average_inner_1train_average_cy(PyObjec case 1: if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_indexed_sentences)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 1); __PYX_ERR(0, 312, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 1); __PYX_ERR(0, 311, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 2: if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_target)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 2); __PYX_ERR(0, 312, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 2); __PYX_ERR(0, 311, __pyx_L3_error) } CYTHON_FALLTHROUGH; case 3: if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_memory)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 3); __PYX_ERR(0, 312, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, 3); __PYX_ERR(0, 311, __pyx_L3_error) } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_average_cy") < 0)) __PYX_ERR(0, 312, __pyx_L3_error) + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_average_cy") < 0)) __PYX_ERR(0, 311, __pyx_L3_error) } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -3955,7 +3949,7 @@ static PyObject *__pyx_pw_3fse_6models_13average_inner_1train_average_cy(PyObjec } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 312, __pyx_L3_error) + __Pyx_RaiseArgtupleInvalid("train_average_cy", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 311, __pyx_L3_error) __pyx_L3_error:; __Pyx_AddTraceback("fse.models.average_inner.train_average_cy", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); @@ -3986,7 +3980,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U __pyx_t_3fse_6models_13average_inner_uINT_t __pyx_t_9; __Pyx_RefNannySetupContext("train_average_cy", 0); - /* "fse/models/average_inner.pyx":336 + /* "fse/models/average_inner.pyx":335 * """ * * cdef uINT_t eff_sentences = 0 # <<<<<<<<<<<<<< @@ -3995,7 +3989,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U */ __pyx_v_eff_sentences = 0; - /* "fse/models/average_inner.pyx":337 + /* "fse/models/average_inner.pyx":336 * * cdef uINT_t eff_sentences = 0 * cdef uINT_t eff_words = 0 # <<<<<<<<<<<<<< @@ -4004,82 +3998,79 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U */ __pyx_v_eff_words = 0; - /* "fse/models/average_inner.pyx":341 + /* "fse/models/average_inner.pyx":340 * cdef FTSentenceVecsConfig ft * * if not model.is_ft: # <<<<<<<<<<<<<< * init_base_s2v_config(&w2v, model, target, memory) * */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_is_ft); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 341, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_is_ft); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 340, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 341, 
__pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_2 < 0)) __PYX_ERR(0, 340, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_3 = ((!__pyx_t_2) != 0); if (__pyx_t_3) { - /* "fse/models/average_inner.pyx":342 + /* "fse/models/average_inner.pyx":341 * * if not model.is_ft: * init_base_s2v_config(&w2v, model, target, memory) # <<<<<<<<<<<<<< * - * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences) + * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv, indexed_sentences) */ - __pyx_t_1 = __pyx_f_3fse_6models_13average_inner_init_base_s2v_config((&__pyx_v_w2v), __pyx_v_model, __pyx_v_target, __pyx_v_memory); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 342, __pyx_L1_error) + __pyx_t_1 = __pyx_f_3fse_6models_13average_inner_init_base_s2v_config((&__pyx_v_w2v), __pyx_v_model, __pyx_v_target, __pyx_v_memory); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 341, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "fse/models/average_inner.pyx":344 + /* "fse/models/average_inner.pyx":343 * init_base_s2v_config(&w2v, model, target, memory) * - * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences) # <<<<<<<<<<<<<< + * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv, indexed_sentences) # <<<<<<<<<<<<<< * * with nogil: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 344, __pyx_L1_error) + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 343, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vocab); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 344, __pyx_L1_error) + __pyx_t_4 = __pyx_f_3fse_6models_13average_inner_populate_base_s2v_config((&__pyx_v_w2v), __pyx_t_1, __pyx_v_indexed_sentences); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 343, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __pyx_f_3fse_6models_13average_inner_populate_base_s2v_config((&__pyx_v_w2v), __pyx_t_4, __pyx_v_indexed_sentences); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 344, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) { - PyObject* sequence = __pyx_t_1; + if ((likely(PyTuple_CheckExact(__pyx_t_4))) || (PyList_CheckExact(__pyx_t_4))) { + PyObject* sequence = __pyx_t_4; Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); if (unlikely(size != 2)) { if (size > 2) __Pyx_RaiseTooManyValuesError(2); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); - __PYX_ERR(0, 344, __pyx_L1_error) + __PYX_ERR(0, 343, __pyx_L1_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { - __pyx_t_4 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_1 = PyTuple_GET_ITEM(sequence, 0); __pyx_t_5 = PyTuple_GET_ITEM(sequence, 1); } else { - __pyx_t_4 = PyList_GET_ITEM(sequence, 0); + __pyx_t_1 = PyList_GET_ITEM(sequence, 0); __pyx_t_5 = PyList_GET_ITEM(sequence, 1); } - __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(__pyx_t_1); __Pyx_INCREF(__pyx_t_5); #else - __pyx_t_4 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 344, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 344, __pyx_L1_error) + __pyx_t_1 = 
PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 343, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_5 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 343, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; } else { Py_ssize_t index = -1; - __pyx_t_6 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 344, __pyx_L1_error) + __pyx_t_6 = PyObject_GetIter(__pyx_t_4); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 343, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; __pyx_t_7 = Py_TYPE(__pyx_t_6)->tp_iternext; - index = 0; __pyx_t_4 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_4)) goto __pyx_L4_unpacking_failed; - __Pyx_GOTREF(__pyx_t_4); + index = 0; __pyx_t_1 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_1)) goto __pyx_L4_unpacking_failed; + __Pyx_GOTREF(__pyx_t_1); index = 1; __pyx_t_5 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_5)) goto __pyx_L4_unpacking_failed; __Pyx_GOTREF(__pyx_t_5); - if (__Pyx_IternextUnpackEndCheck(__pyx_t_7(__pyx_t_6), 2) < 0) __PYX_ERR(0, 344, __pyx_L1_error) + if (__Pyx_IternextUnpackEndCheck(__pyx_t_7(__pyx_t_6), 2) < 0) __PYX_ERR(0, 343, __pyx_L1_error) __pyx_t_7 = NULL; __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; goto __pyx_L5_unpacking_done; @@ -4087,18 +4078,18 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_7 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); - __PYX_ERR(0, 344, __pyx_L1_error) + __PYX_ERR(0, 343, __pyx_L1_error) __pyx_L5_unpacking_done:; } - __pyx_t_8 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_8 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 344, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - __pyx_t_9 = __Pyx_PyInt_As_npy_uint32(__pyx_t_5); if (unlikely((__pyx_t_9 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 344, __pyx_L1_error) + __pyx_t_8 = __Pyx_PyInt_As_npy_uint32(__pyx_t_1); if (unlikely((__pyx_t_8 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 343, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_9 = __Pyx_PyInt_As_npy_uint32(__pyx_t_5); if (unlikely((__pyx_t_9 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 343, __pyx_L1_error) __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_v_eff_sentences = __pyx_t_8; __pyx_v_eff_words = __pyx_t_9; - /* "fse/models/average_inner.pyx":346 - * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences) + /* "fse/models/average_inner.pyx":345 + * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv, indexed_sentences) * * with nogil: # <<<<<<<<<<<<<< * compute_base_sentence_averages(&w2v, eff_sentences) @@ -4112,7 +4103,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U #endif /*try:*/ { - /* "fse/models/average_inner.pyx":347 + /* "fse/models/average_inner.pyx":346 * * with nogil: * compute_base_sentence_averages(&w2v, eff_sentences) # <<<<<<<<<<<<<< @@ -4122,8 +4113,8 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U __pyx_f_3fse_6models_13average_inner_compute_base_sentence_averages((&__pyx_v_w2v), __pyx_v_eff_sentences); } - /* "fse/models/average_inner.pyx":346 - * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences) + /* 
"fse/models/average_inner.pyx":345 + * eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv, indexed_sentences) * * with nogil: # <<<<<<<<<<<<<< * compute_base_sentence_averages(&w2v, eff_sentences) @@ -4141,7 +4132,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U } } - /* "fse/models/average_inner.pyx":341 + /* "fse/models/average_inner.pyx":340 * cdef FTSentenceVecsConfig ft * * if not model.is_ft: # <<<<<<<<<<<<<< @@ -4151,69 +4142,66 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U goto __pyx_L3; } - /* "fse/models/average_inner.pyx":349 + /* "fse/models/average_inner.pyx":348 * compute_base_sentence_averages(&w2v, eff_sentences) * else: * init_ft_s2v_config(&ft, model, target, memory) # <<<<<<<<<<<<<< * - * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences) + * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv, indexed_sentences) */ /*else*/ { - __pyx_t_1 = __pyx_f_3fse_6models_13average_inner_init_ft_s2v_config((&__pyx_v_ft), __pyx_v_model, __pyx_v_target, __pyx_v_memory); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 349, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_4 = __pyx_f_3fse_6models_13average_inner_init_ft_s2v_config((&__pyx_v_ft), __pyx_v_model, __pyx_v_target, __pyx_v_memory); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 348, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; - /* "fse/models/average_inner.pyx":351 + /* "fse/models/average_inner.pyx":350 * init_ft_s2v_config(&ft, model, target, memory) * - * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences) # <<<<<<<<<<<<<< + * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv, indexed_sentences) # <<<<<<<<<<<<<< * * with nogil: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 351, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_vocab); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 351, __pyx_L1_error) + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_wv); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 350, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __pyx_t_5 = __pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config((&__pyx_v_ft), __pyx_t_4, __pyx_v_indexed_sentences); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 350, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __pyx_f_3fse_6models_13average_inner_populate_ft_s2v_config((&__pyx_v_ft), __pyx_t_5, __pyx_v_indexed_sentences); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 351, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - if ((likely(PyTuple_CheckExact(__pyx_t_1))) || (PyList_CheckExact(__pyx_t_1))) { - PyObject* sequence = __pyx_t_1; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + if ((likely(PyTuple_CheckExact(__pyx_t_5))) || (PyList_CheckExact(__pyx_t_5))) { + PyObject* sequence = __pyx_t_5; Py_ssize_t size = __Pyx_PySequence_SIZE(sequence); if (unlikely(size != 2)) { if (size > 2) __Pyx_RaiseTooManyValuesError(2); else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); - __PYX_ERR(0, 351, __pyx_L1_error) + __PYX_ERR(0, 350, __pyx_L1_error) } #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS if (likely(PyTuple_CheckExact(sequence))) { - __pyx_t_5 = PyTuple_GET_ITEM(sequence, 0); - __pyx_t_4 = 
PyTuple_GET_ITEM(sequence, 1); + __pyx_t_4 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_1 = PyTuple_GET_ITEM(sequence, 1); } else { - __pyx_t_5 = PyList_GET_ITEM(sequence, 0); - __pyx_t_4 = PyList_GET_ITEM(sequence, 1); + __pyx_t_4 = PyList_GET_ITEM(sequence, 0); + __pyx_t_1 = PyList_GET_ITEM(sequence, 1); } - __Pyx_INCREF(__pyx_t_5); __Pyx_INCREF(__pyx_t_4); + __Pyx_INCREF(__pyx_t_1); #else - __pyx_t_5 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 351, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_5); - __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 351, __pyx_L1_error) + __pyx_t_4 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 350, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); + __pyx_t_1 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 350, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); #endif - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; } else { Py_ssize_t index = -1; - __pyx_t_6 = PyObject_GetIter(__pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 351, __pyx_L1_error) + __pyx_t_6 = PyObject_GetIter(__pyx_t_5); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 350, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_7 = Py_TYPE(__pyx_t_6)->tp_iternext; - index = 0; __pyx_t_5 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_5)) goto __pyx_L9_unpacking_failed; - __Pyx_GOTREF(__pyx_t_5); - index = 1; __pyx_t_4 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_4)) goto __pyx_L9_unpacking_failed; + index = 0; __pyx_t_4 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_4)) goto __pyx_L9_unpacking_failed; __Pyx_GOTREF(__pyx_t_4); - if (__Pyx_IternextUnpackEndCheck(__pyx_t_7(__pyx_t_6), 2) < 0) __PYX_ERR(0, 351, __pyx_L1_error) + index = 1; __pyx_t_1 = __pyx_t_7(__pyx_t_6); if (unlikely(!__pyx_t_1)) goto __pyx_L9_unpacking_failed; + __Pyx_GOTREF(__pyx_t_1); + if (__Pyx_IternextUnpackEndCheck(__pyx_t_7(__pyx_t_6), 2) < 0) __PYX_ERR(0, 350, __pyx_L1_error) __pyx_t_7 = NULL; __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; goto __pyx_L10_unpacking_done; @@ -4221,18 +4209,18 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_7 = NULL; if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); - __PYX_ERR(0, 351, __pyx_L1_error) + __PYX_ERR(0, 350, __pyx_L1_error) __pyx_L10_unpacking_done:; } - __pyx_t_9 = __Pyx_PyInt_As_npy_uint32(__pyx_t_5); if (unlikely((__pyx_t_9 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 351, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __pyx_t_8 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_8 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 351, __pyx_L1_error) + __pyx_t_9 = __Pyx_PyInt_As_npy_uint32(__pyx_t_4); if (unlikely((__pyx_t_9 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 350, __pyx_L1_error) __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_8 = __Pyx_PyInt_As_npy_uint32(__pyx_t_1); if (unlikely((__pyx_t_8 == ((npy_uint32)-1)) && PyErr_Occurred())) __PYX_ERR(0, 350, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_v_eff_sentences = __pyx_t_9; __pyx_v_eff_words = __pyx_t_8; - /* "fse/models/average_inner.pyx":353 - * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences) + /* "fse/models/average_inner.pyx":352 + * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv, indexed_sentences) * 
* with nogil: # <<<<<<<<<<<<<< * compute_ft_sentence_averages(&ft, eff_sentences) @@ -4246,7 +4234,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U #endif /*try:*/ { - /* "fse/models/average_inner.pyx":354 + /* "fse/models/average_inner.pyx":353 * * with nogil: * compute_ft_sentence_averages(&ft, eff_sentences) # <<<<<<<<<<<<<< @@ -4256,8 +4244,8 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U __pyx_f_3fse_6models_13average_inner_compute_ft_sentence_averages((&__pyx_v_ft), __pyx_v_eff_sentences); } - /* "fse/models/average_inner.pyx":353 - * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences) + /* "fse/models/average_inner.pyx":352 + * eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv, indexed_sentences) * * with nogil: # <<<<<<<<<<<<<< * compute_ft_sentence_averages(&ft, eff_sentences) @@ -4277,7 +4265,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U } __pyx_L3:; - /* "fse/models/average_inner.pyx":356 + /* "fse/models/average_inner.pyx":355 * compute_ft_sentence_averages(&ft, eff_sentences) * * return eff_sentences, eff_words # <<<<<<<<<<<<<< @@ -4285,23 +4273,23 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U * def init(): */ __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_sentences); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 356, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_sentences); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 355, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_1 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_words); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 355, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_4 = __Pyx_PyInt_From_npy_uint32(__pyx_v_eff_words); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 356, __pyx_L1_error) + __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 355, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_4); - __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 356, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_5); + __Pyx_GIVEREF(__pyx_t_5); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_5); __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1); - __Pyx_GIVEREF(__pyx_t_4); - PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_1); + __pyx_t_5 = 0; __pyx_t_1 = 0; + __pyx_r = __pyx_t_4; __pyx_t_4 = 0; - __pyx_r = __pyx_t_5; - __pyx_t_5 = 0; goto __pyx_L0; - /* "fse/models/average_inner.pyx":312 + /* "fse/models/average_inner.pyx":311 * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) * * def train_average_cy(model, indexed_sentences, target, memory): # <<<<<<<<<<<<<< @@ -4323,7 +4311,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_train_average_cy(CYTHON_U return __pyx_r; } -/* "fse/models/average_inner.pyx":358 +/* "fse/models/average_inner.pyx":357 * return eff_sentences, eff_words * * def init(): # <<<<<<<<<<<<<< @@ -4351,7 +4339,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_2init(CYTHON_UNUSED PyObj __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("init", 0); - /* "fse/models/average_inner.pyx":359 + /* "fse/models/average_inner.pyx":358 * * def init(): * return 1 # <<<<<<<<<<<<<< @@ -4363,7 +4351,7 @@ static PyObject *__pyx_pf_3fse_6models_13average_inner_2init(CYTHON_UNUSED PyObj __pyx_r = __pyx_int_1; goto __pyx_L0; - /* "fse/models/average_inner.pyx":358 + /* 
"fse/models/average_inner.pyx":357 * return eff_sentences, eff_words * * def init(): # <<<<<<<<<<<<<< @@ -5435,12 +5423,12 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_kp_s_fse_models_average_inner_pyx, __pyx_k_fse_models_average_inner_pyx, sizeof(__pyx_k_fse_models_average_inner_pyx), 0, 0, 1, 0}, {&__pyx_n_s_ft, __pyx_k_ft, sizeof(__pyx_k_ft), 0, 0, 1, 1}, {&__pyx_n_s_ft_hash_bytes, __pyx_k_ft_hash_bytes, sizeof(__pyx_k_ft_hash_bytes), 0, 0, 1, 1}, - {&__pyx_n_s_gensim_models__utils_any2vec, __pyx_k_gensim_models__utils_any2vec, sizeof(__pyx_k_gensim_models__utils_any2vec), 0, 0, 1, 1}, + {&__pyx_n_s_gensim_models_fasttext, __pyx_k_gensim_models_fasttext, sizeof(__pyx_k_gensim_models_fasttext), 0, 0, 1, 1}, {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, - {&__pyx_n_s_index, __pyx_k_index, sizeof(__pyx_k_index), 0, 0, 1, 1}, {&__pyx_n_s_indexed_sentences, __pyx_k_indexed_sentences, sizeof(__pyx_k_indexed_sentences), 0, 0, 1, 1}, {&__pyx_n_s_init, __pyx_k_init, sizeof(__pyx_k_init), 0, 0, 1, 1}, {&__pyx_n_s_is_ft, __pyx_k_is_ft, sizeof(__pyx_k_is_ft), 0, 0, 1, 1}, + {&__pyx_n_s_key_to_index, __pyx_k_key_to_index, sizeof(__pyx_k_key_to_index), 0, 0, 1, 1}, {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, {&__pyx_n_s_max, __pyx_k_max, sizeof(__pyx_k_max), 0, 0, 1, 1}, {&__pyx_n_s_max_n, __pyx_k_max_n, sizeof(__pyx_k_max_n), 0, 0, 1, 1}, @@ -5465,7 +5453,6 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_vectors, __pyx_k_vectors, sizeof(__pyx_k_vectors), 0, 0, 1, 1}, {&__pyx_n_s_vectors_ngrams, __pyx_k_vectors_ngrams, sizeof(__pyx_k_vectors_ngrams), 0, 0, 1, 1}, {&__pyx_n_s_vectors_vocab, __pyx_k_vectors_vocab, sizeof(__pyx_k_vectors_vocab), 0, 0, 1, 1}, - {&__pyx_n_s_vocab, __pyx_k_vocab, sizeof(__pyx_k_vocab), 0, 0, 1, 1}, {&__pyx_n_s_w2v, __pyx_k_w2v, sizeof(__pyx_k_w2v), 0, 0, 1, 1}, {&__pyx_n_s_word_weights, __pyx_k_word_weights, sizeof(__pyx_k_word_weights), 0, 0, 1, 1}, {&__pyx_n_s_workers, __pyx_k_workers, sizeof(__pyx_k_workers), 0, 0, 1, 1}, @@ -5473,8 +5460,8 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {0, 0, 0, 0, 0, 0, 0} }; static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 196, __pyx_L1_error) - __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 236, __pyx_L1_error) + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) __PYX_ERR(0, 195, __pyx_L1_error) + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 235, __pyx_L1_error) __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(1, 945, __pyx_L1_error) return 0; __pyx_L1_error:; @@ -5507,26 +5494,26 @@ static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) { __Pyx_GOTREF(__pyx_tuple__2); __Pyx_GIVEREF(__pyx_tuple__2); - /* "fse/models/average_inner.pyx":312 + /* "fse/models/average_inner.pyx":311 * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) * * def train_average_cy(model, indexed_sentences, target, memory): # <<<<<<<<<<<<<< * """Training on a sequence of sentences and update the target ndarray. 
* */ - __pyx_tuple__4 = PyTuple_Pack(8, __pyx_n_s_model, __pyx_n_s_indexed_sentences, __pyx_n_s_target, __pyx_n_s_memory, __pyx_n_s_eff_sentences, __pyx_n_s_eff_words, __pyx_n_s_w2v, __pyx_n_s_ft); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 312, __pyx_L1_error) + __pyx_tuple__4 = PyTuple_Pack(8, __pyx_n_s_model, __pyx_n_s_indexed_sentences, __pyx_n_s_target, __pyx_n_s_memory, __pyx_n_s_eff_sentences, __pyx_n_s_eff_words, __pyx_n_s_w2v, __pyx_n_s_ft); if (unlikely(!__pyx_tuple__4)) __PYX_ERR(0, 311, __pyx_L1_error) __Pyx_GOTREF(__pyx_tuple__4); __Pyx_GIVEREF(__pyx_tuple__4); - __pyx_codeobj__5 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__4, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fse_models_average_inner_pyx, __pyx_n_s_train_average_cy, 312, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__5)) __PYX_ERR(0, 312, __pyx_L1_error) + __pyx_codeobj__5 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__4, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fse_models_average_inner_pyx, __pyx_n_s_train_average_cy, 311, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__5)) __PYX_ERR(0, 311, __pyx_L1_error) - /* "fse/models/average_inner.pyx":358 + /* "fse/models/average_inner.pyx":357 * return eff_sentences, eff_words * * def init(): # <<<<<<<<<<<<<< * return 1 * */ - __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fse_models_average_inner_pyx, __pyx_n_s_init, 358, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 358, __pyx_L1_error) + __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(0, 0, 0, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_fse_models_average_inner_pyx, __pyx_n_s_init, 357, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_RefNannyFinishContext(); return 0; __pyx_L1_error:; @@ -5881,7 +5868,7 @@ if (!__Pyx_RefNanny) { /* "fse/models/average_inner.pyx":18 * cimport numpy as np * - * from gensim.models._utils_any2vec import compute_ngrams_bytes, ft_hash_bytes # <<<<<<<<<<<<<< + * from gensim.models.fasttext import compute_ngrams_bytes, ft_hash_bytes # <<<<<<<<<<<<<< * * from libc.string cimport memset */ @@ -5893,7 +5880,7 @@ if (!__Pyx_RefNanny) { __Pyx_INCREF(__pyx_n_s_ft_hash_bytes); __Pyx_GIVEREF(__pyx_n_s_ft_hash_bytes); PyList_SET_ITEM(__pyx_t_1, 1, __pyx_n_s_ft_hash_bytes); - __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim_models__utils_any2vec, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) + __pyx_t_2 = __Pyx_Import(__pyx_n_s_gensim_models_fasttext, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_compute_ngrams_bytes); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 18, __pyx_L1_error) @@ -5996,58 +5983,58 @@ if (!__Pyx_RefNanny) { */ __pyx_v_3fse_6models_13average_inner_ZEROF = ((__pyx_t_3fse_6models_13average_inner_REAL_t)0.0); - /* "fse/models/average_inner.pyx":312 + /* "fse/models/average_inner.pyx":311 * saxpy(&size, &inv_count, c.mem, &ONE, &c.sentence_vectors[sent_row], &ONE) * * def train_average_cy(model, 
indexed_sentences, target, memory): # <<<<<<<<<<<<<< * """Training on a sequence of sentences and update the target ndarray. * */ - __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3fse_6models_13average_inner_1train_average_cy, NULL, __pyx_n_s_fse_models_average_inner); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 312, __pyx_L1_error) + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3fse_6models_13average_inner_1train_average_cy, NULL, __pyx_n_s_fse_models_average_inner); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 311, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_average_cy, __pyx_t_1) < 0) __PYX_ERR(0, 312, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_train_average_cy, __pyx_t_1) < 0) __PYX_ERR(0, 311, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "fse/models/average_inner.pyx":358 + /* "fse/models/average_inner.pyx":357 * return eff_sentences, eff_words * * def init(): # <<<<<<<<<<<<<< * return 1 * */ - __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3fse_6models_13average_inner_3init, NULL, __pyx_n_s_fse_models_average_inner); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 358, __pyx_L1_error) + __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3fse_6models_13average_inner_3init, NULL, __pyx_n_s_fse_models_average_inner); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_init, __pyx_t_1) < 0) __PYX_ERR(0, 358, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_init, __pyx_t_1) < 0) __PYX_ERR(0, 357, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - /* "fse/models/average_inner.pyx":361 + /* "fse/models/average_inner.pyx":360 * return 1 * * MAX_WORDS_IN_BATCH = MAX_WORDS # <<<<<<<<<<<<<< * MAX_NGRAMS_IN_BATCH = MAX_NGRAMS * FAST_VERSION = init() */ - if (PyDict_SetItem(__pyx_d, __pyx_n_s_MAX_WORDS_IN_BATCH, __pyx_int_10000) < 0) __PYX_ERR(0, 361, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_MAX_WORDS_IN_BATCH, __pyx_int_10000) < 0) __PYX_ERR(0, 360, __pyx_L1_error) - /* "fse/models/average_inner.pyx":362 + /* "fse/models/average_inner.pyx":361 * * MAX_WORDS_IN_BATCH = MAX_WORDS * MAX_NGRAMS_IN_BATCH = MAX_NGRAMS # <<<<<<<<<<<<<< * FAST_VERSION = init() */ - if (PyDict_SetItem(__pyx_d, __pyx_n_s_MAX_NGRAMS_IN_BATCH, __pyx_int_40) < 0) __PYX_ERR(0, 362, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_MAX_NGRAMS_IN_BATCH, __pyx_int_40) < 0) __PYX_ERR(0, 361, __pyx_L1_error) - /* "fse/models/average_inner.pyx":363 + /* "fse/models/average_inner.pyx":362 * MAX_WORDS_IN_BATCH = MAX_WORDS * MAX_NGRAMS_IN_BATCH = MAX_NGRAMS * FAST_VERSION = init() # <<<<<<<<<<<<<< */ - __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_init); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 363, __pyx_L1_error) + __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_init); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 362, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 363, __pyx_L1_error) + __pyx_t_2 = __Pyx_PyObject_CallNoArg(__pyx_t_1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 362, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - if (PyDict_SetItem(__pyx_d, __pyx_n_s_FAST_VERSION, __pyx_t_2) < 0) __PYX_ERR(0, 363, __pyx_L1_error) + if (PyDict_SetItem(__pyx_d, __pyx_n_s_FAST_VERSION, __pyx_t_2) < 0) __PYX_ERR(0, 362, __pyx_L1_error) __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; /* "fse/models/average_inner.pyx":1 diff --git a/fse/models/average_inner.pyx b/fse/models/average_inner.pyx index 
5c3c65b..2222447 100644 --- a/fse/models/average_inner.pyx +++ b/fse/models/average_inner.pyx @@ -15,7 +15,7 @@ import numpy as np cimport numpy as np -from gensim.models._utils_any2vec import compute_ngrams_bytes, ft_hash_bytes +from gensim.models.fasttext import compute_ngrams_bytes, ft_hash_bytes from libc.string cimport memset from libc.stdio cimport printf @@ -96,7 +96,7 @@ cdef init_ft_s2v_config(FTSentenceVecsConfig *c, model, target, memory): c[0].sentence_vectors = (np.PyArray_DATA(target)) -cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_sentences): +cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, wv, indexed_sentences): """Prepare C structures for BaseAny2VecModel so we can go "full C" and release the Python GIL. We create indices over the sentences. We also perform some calculations for @@ -106,8 +106,8 @@ cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_s ---------- c : BaseSentenceVecsConfig* A pointer to the struct that will contain the populated indices. - vocab : dict - The vocabulary + wv : obj + The word vector object indexed_sentences : iterable of tuple The sentences to read @@ -129,10 +129,10 @@ cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_s if not obj[0]: continue for token in obj[0]: - word = vocab[token] if token in vocab else None # Vocab obj + word = token if token in wv.key_to_index else None if word is None: continue - c.word_indices[eff_words] = word.index + c.word_indices[eff_words] = wv.key_to_index[token] c.sent_adresses[eff_words] = obj[1] eff_words += ONE @@ -146,7 +146,7 @@ cdef object populate_base_s2v_config(BaseSentenceVecsConfig *c, vocab, indexed_s return eff_sents, eff_words -cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sentences): +cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, wv, indexed_sentences): """Prepare C structures for FastText so we can go "full C" and release the Python GIL. We create indices over the sentences. We also perform some calculations for @@ -156,8 +156,8 @@ cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sente ---------- c : FTSentenceVecsConfig* A pointer to the struct that will contain the populated indices. 
- vocab : dict - The vocabulary + wv : obj + The word vector object indexed_sentences : iterable of tuples The sentences to read @@ -180,10 +180,9 @@ cdef object populate_ft_s2v_config(FTSentenceVecsConfig *c, vocab, indexed_sente continue for token in obj[0]: c.sent_adresses[eff_words] = obj[1] - if token in vocab: + if token in wv.key_to_index: # In Vocabulary - word = vocab[token] - c.word_indices[eff_words] = word.index + c.word_indices[eff_words] = wv.key_to_index[token] c.subwords_idx_len[eff_words] = ZERO else: # OOV words --> write ngram indices to memory @@ -341,14 +340,14 @@ def train_average_cy(model, indexed_sentences, target, memory): if not model.is_ft: init_base_s2v_config(&w2v, model, target, memory) - eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv.vocab, indexed_sentences) + eff_sentences, eff_words = populate_base_s2v_config(&w2v, model.wv, indexed_sentences) with nogil: compute_base_sentence_averages(&w2v, eff_sentences) else: init_ft_s2v_config(&ft, model, target, memory) - eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv.vocab, indexed_sentences) + eff_sentences, eff_words = populate_ft_s2v_config(&ft, model.wv, indexed_sentences) with nogil: compute_ft_sentence_averages(&ft, eff_sentences) diff --git a/fse/models/base_s2v.py b/fse/models/base_s2v.py index dc9dd60..e7a1ef3 100644 --- a/fse/models/base_s2v.py +++ b/fse/models/base_s2v.py @@ -11,7 +11,7 @@ Attributes ---------- -wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` +wv : :class:`~gensim.models.keyedvectors.KeyedVectors` This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples. @@ -34,12 +34,13 @@ """ -from fse.models.sentencevectors import SentenceVectors +from fse.models.sentencevectors import SentenceVectors, _l2_norm from fse.models.utils import set_madvise_for_mmap -from gensim.models.base_any2vec import BaseWordEmbeddingsModel -from gensim.models.keyedvectors import BaseKeyedVectors, FastTextKeyedVectors, _l2_norm +from gensim.models import Word2Vec, FastText +from gensim.models.keyedvectors import KeyedVectors +from gensim.models.fasttext import FastTextKeyedVectors from gensim.utils import SaveLoad from gensim.matutils import zeros_aligned @@ -59,7 +60,7 @@ from wordfreq import available_languages, get_frequency_dict -from typing import List, Dict +from typing import List, Dict, Tuple from time import time from psutil import virtual_memory @@ -81,7 +82,7 @@ class BaseSentence2VecModel(SaveLoad): def __init__( self, - model: BaseKeyedVectors, + model: KeyedVectors, sv_mapfile_path: str = None, wv_mapfile_path: str = None, workers: int = 1, @@ -96,9 +97,9 @@ def __init__( Parameters ---------- - model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings - the wv.vocab and wv.vector elements are required. + the wv and wv.vector elements are required. sv_mapfile_path : str, optional Optional path to store the sentence-vectors in for very large datasets. Used for memmap. wv_mapfile_path : str, optional @@ -110,7 +111,7 @@ def __init__( lang_freq : str, optional Some pre-trained embeddings, i.e. 
"GoogleNews-vectors-negative300.bin", do not contain information about the frequency of a word. As the frequency is required for estimating the word weights, we induce - frequencies into the wv.vocab.count based on :class:`~wordfreq` + frequencies into the wv based on :class:`~wordfreq` If no frequency information is available, you can choose the language to estimate the frequency. See https://github.com/LuminosoInsight/wordfreq fast_version : {-1, 1}, optional @@ -157,7 +158,7 @@ def __init__( ) self.prep = BaseSentence2VecPreparer() - self.word_weights = ones(len(self.wv.vocab), REAL) + self.word_weights = ones(len(self.wv), REAL) def __str__(self) -> str: """Human readable representation of the model's state. @@ -168,24 +169,26 @@ def __str__(self) -> str: Human readable representation of the model's state. """ - return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, size={len(self.sv)}" + return f"{self.__class__.__name__} based on {self.wv.__class__.__name__}, vector_size={len(self.sv)}" - def _check_and_include_model(self, model: BaseKeyedVectors): + def _check_and_include_model(self, model: KeyedVectors): """Check if the supplied model is a compatible model. Performs all kinds of checks and small optimizations. Parameters ---------- - model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` The model to inject into this class. """ - if isinstance(model, BaseWordEmbeddingsModel): + if isinstance(model, (Word2Vec, FastText)): + if not hasattr(model, "wv"): + raise RuntimeError("Model does not contain wv object.") self.wv = model.wv - elif isinstance(model, BaseKeyedVectors): + elif isinstance(model, KeyedVectors): self.wv = model else: raise RuntimeError( - f"Model must be child of BaseWordEmbeddingsModel or BaseKeyedVectors. Received {str(model)}" + f"Model must be child of BaseWordEmbeddingsModel or KeyedVectors. Received {str(model)}" ) self.wv.vectors_norm = None @@ -210,8 +213,6 @@ def _check_and_include_model(self, model: BaseKeyedVectors): raise RuntimeError( "Word vectors required for sentence embeddings not found." ) - if not hasattr(self.wv, "vocab"): - raise RuntimeError("Vocab required for sentence embeddings not found.") def _check_language_settings(self, lang_freq: str): """Check if the supplied language is a compatible with the wordfreq package @@ -219,7 +220,7 @@ def _check_language_settings(self, lang_freq: str): Parameters ---------- lang_freq : str - The language used to induce the frequencies into the wv.vocab object. + The language used to induce the frequencies into the wv object. 
""" if lang_freq in available_languages(wordlist="best"): @@ -241,11 +242,11 @@ def _induce_frequencies(self, domain: int = 2 ** 31 - 1): """ freq_dict = get_frequency_dict(self.lang_freq, wordlist="best") - for word in self.wv.index2word: + for word in self.wv.index_to_key: if word in freq_dict: - self.wv.vocab[word].count = int(freq_dict[word] * domain) + self.wv.set_vecattr(word, "count", int(freq_dict[word] * domain)) else: - self.wv.vocab[word].count = int(1e-8 * domain) + self.wv.set_vecattr(word, "count", int(1e-8 * domain)) def _check_input_data_sanity(self, data_iterable: tuple): """Check if the input data complies with the required formats @@ -299,7 +300,7 @@ def _check_pre_training_sanity( """ if not hasattr(self, "wv") or self.wv is None: - raise RuntimeError("you must first load a valid BaseKeyedVectors object") + raise RuntimeError("you must first load a valid KeyedVectors object") if not len(self.wv.vectors): raise RuntimeError( "you must initialize vectors before computing sentence vectors" @@ -314,7 +315,9 @@ def _check_pre_training_sanity( "you must initialize vectors_vocab before computing sentence vectors" ) - if sum([self.wv.vocab[w].count for w in self.wv.vocab]) == len(self.wv.vocab): + if sum([self.wv.get_vecattr(w, "count") for w in self.wv.key_to_index]) == len( + self.wv + ): logger.warning( "The sum of the word counts is equal to its length (all word counts are 1). " "Make sure to obtain proper word counts by using lang_freq for pretrained embeddings." @@ -364,7 +367,7 @@ def _check_post_training_sanity(self, eff_sentences: int, eff_words: int): def _check_indexed_sent_valid( self, iterPos: int, obj: tuple, checked: int = False - ) -> [int, List[str]]: + ) -> Tuple[int, List[str]]: """Performs a check if the passed object contains valid data Parameters @@ -493,7 +496,7 @@ def _move_ndarray_to_disk( readonly_memvecs = np_memmap(path, dtype=REAL, mode="r", shape=shape) return readonly_memvecs - def _get_thread_working_mem(self) -> [ndarray, ndarray]: + def _get_thread_working_mem(self) -> Tuple[ndarray, ndarray]: """Computes the memory used per worker thread. Returns @@ -508,7 +511,7 @@ def _get_thread_working_mem(self) -> [ndarray, ndarray]: def _do_train_job( self, data_iterable: List[tuple], target: ndarray, memory: ndarray - ) -> [int, int]: + ) -> Tuple[int, int]: """ Function to be called on a batch of sentences. Returns eff sentences/words """ raise NotImplementedError() @@ -574,9 +577,6 @@ def save(self, *args, **kwargs): # Manually removes vectors from the wv class because we cannot modify the save method if self.wv_mapfile_path is not None: self.wv.vectors = None - if self.is_ft: - self.wv.vectors_vocab = None - self.wv.vectors_ngrams = None super(BaseSentence2VecModel, self).save(*args, **kwargs) def scan_sentences( @@ -703,7 +703,7 @@ def train( update: bool = False, queue_factor: int = 2, report_delay: int = 5, - ) -> [int, int]: + ) -> Tuple[int, int]: """Main routine to train an embedding. This method writes all sentences vectors into sv.vectors and is used for computing embeddings for large chunks of data. This method also handles post-training transformations, such as computing the SVD of the sentence vectors. 
diff --git a/fse/models/sentencevectors.py b/fse/models/sentencevectors.py index a4ed096..f614469 100644 --- a/fse/models/sentencevectors.py +++ b/fse/models/sentencevectors.py @@ -11,7 +11,7 @@ from fse.models.utils import set_madvise_for_mmap -from gensim.models.keyedvectors import BaseKeyedVectors +from gensim.models.keyedvectors import KeyedVectors from numpy import ( dot, @@ -137,7 +137,7 @@ def get_vector(self, index: int, use_norm: bool = False) -> ndarray: ---------- index : int Input index - use_norm : bool, optional + norm : bool, optional If True - resulting vector will be L2-normalized (unit euclidean length). Returns @@ -328,7 +328,7 @@ def most_similar( def similar_by_word( self, word: str, - wv: BaseKeyedVectors, + wv: KeyedVectors, indexable: Union[IndexedList, IndexedLineDocument] = None, topn: int = 10, restrict_size: Union[int, Tuple[int, int]] = None, @@ -340,7 +340,7 @@ def similar_by_word( ---------- word : str Word - wv : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` + wv : :class:`~gensim.models.keyedvectors.KeyedVectors` This object essentially contains the mapping between words and embeddings. indexable: list, IndexedList, IndexedLineDocument Provides an indexable object from where the most similar sentences are read diff --git a/fse/models/sif.py b/fse/models/sif.py index 165b5e9..fa921f4 100644 --- a/fse/models/sif.py +++ b/fse/models/sif.py @@ -7,7 +7,7 @@ from fse.models.average import Average from fse.models.utils import compute_principal_components, remove_principal_components -from gensim.models.keyedvectors import BaseKeyedVectors +from gensim.models.keyedvectors import KeyedVectors from numpy import ndarray, float32 as REAL, zeros, isfinite @@ -19,7 +19,7 @@ class SIF(Average): def __init__( self, - model: BaseKeyedVectors, + model: KeyedVectors, alpha: float = 1e-3, components: int = 1, cache_size_gb: float = 1.0, @@ -36,7 +36,7 @@ def __init__( Parameters ---------- - model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings the wv.vocab and wv.vector elements are required. 
alpha : float, optional @@ -133,15 +133,17 @@ def _check_dtype_santiy(self): def _compute_sif_weights(self): """ Precomputes the SIF weights for all words in the vocabulary """ - logger.info(f"pre-computing SIF weights for {len(self.wv.vocab)} words") - v = len(self.wv.vocab) + logger.info(f"pre-computing SIF weights for {len(self.wv)} words") + v = len(self.wv) corpus_size = 0 pw = zeros(v, dtype=REAL) - for word in self.wv.vocab: - c = self.wv.vocab[word].count + for word in self.wv.key_to_index: + c = self.wv.get_vecattr(word, "count") + if c < 0: + raise ValueError("vocab count is negative") corpus_size += c - pw[self.wv.vocab[word].index] = c + pw[self.wv.key_to_index[word]] = c pw /= corpus_size self.word_weights = (self.alpha / (self.alpha + pw)).astype(REAL) diff --git a/fse/models/usif.py b/fse/models/usif.py index fbe9acb..5e29be6 100644 --- a/fse/models/usif.py +++ b/fse/models/usif.py @@ -6,7 +6,7 @@ import logging -from gensim.models.keyedvectors import BaseKeyedVectors +from gensim.models.keyedvectors import KeyedVectors from numpy import float32 as REAL from numpy import isfinite, ndarray, zeros @@ -23,7 +23,7 @@ class uSIF(Average): def __init__( self, - model: BaseKeyedVectors, + model: KeyedVectors, length: int = None, components: int = 5, cache_size_gb: float = 1.0, @@ -41,7 +41,7 @@ def __init__( Parameters ---------- - model : :class:`~gensim.models.keyedvectors.BaseKeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.keyedvectors.KeyedVectors` or :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` This object essentially contains the mapping between words and embeddings. To compute the sentence embeddings the wv.vocab and wv.vector elements are required. length : int, optional @@ -153,15 +153,17 @@ def _check_dtype_santiy(self): def _compute_usif_weights(self): """Precomputes the uSIF weights.""" - logger.info(f"pre-computing uSIF weights for {len(self.wv.vocab)} words") - v = len(self.wv.vocab) + logger.info(f"pre-computing uSIF weights for {len(self.wv)} words") + v = len(self.wv) corpus_size = 0 pw = zeros(v, dtype=REAL) - for word in self.wv.vocab: - c = self.wv.vocab[word].count + for word in self.wv.key_to_index: + c = self.wv.get_vecattr(word, "count") + if c < 0: + raise ValueError("vocab count is negative") corpus_size += c - pw[self.wv.vocab[word].index] = c + pw[self.wv.key_to_index[word]] = c pw /= corpus_size threshold = 1 - (1 - (1 / v)) ** self.length diff --git a/fse/vectors.py b/fse/vectors.py index 5e243bc..ca6a872 100644 --- a/fse/vectors.py +++ b/fse/vectors.py @@ -9,14 +9,15 @@ from pathlib import Path -from gensim.models.keyedvectors import FastTextKeyedVectors, Word2VecKeyedVectors +from gensim.models.fasttext import FastTextKeyedVectors +from gensim.models.keyedvectors import KeyedVectors from huggingface_hub import snapshot_download from requests import HTTPError _SUFFIX: str = ".model" -class Vectors(Word2VecKeyedVectors): +class Vectors(KeyedVectors): """Class to instantiates vectors from pretrained models.""" @classmethod diff --git a/notebooks/STS-Benchmarks.ipynb b/notebooks/STS-Benchmarks.ipynb index 8274d54..df3011f 100644 --- a/notebooks/STS-Benchmarks.ipynb +++ b/notebooks/STS-Benchmarks.ipynb @@ -214,70 +214,1960 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:19:01,771 : MainThread : INFO : loading Vectors object from 
/home/oborchers/.cache/huggingface/hub/fse__fasttext-wiki-news-subwords-300.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model\n", - "2021-12-03 09:19:03,869 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__fasttext-wiki-news-subwords-300.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:03,872 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:03,873 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__fasttext-wiki-news-subwords-300.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model\n", - "2021-12-03 09:19:06,810 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-100.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model\n", - "2021-12-03 09:19:09,430 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-100.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:09,434 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:09,434 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-100.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model\n", - "2021-12-03 09:19:14,017 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-200.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model\n", - "2021-12-03 09:19:16,919 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-200.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:16,921 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:16,922 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-200.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model\n", - "2021-12-03 09:19:19,183 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-25.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model\n", - "2021-12-03 09:19:21,933 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-25.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:21,934 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:21,934 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-25.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model\n", - "2021-12-03 09:19:24,991 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-50.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model\n", - "2021-12-03 09:19:27,978 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-twitter-50.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:27,980 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:27,981 : MainThread : INFO : loaded 
/home/oborchers/.cache/huggingface/hub/fse__glove-twitter-50.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model\n", - "2021-12-03 09:19:37,180 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n", - "2021-12-03 09:19:39,388 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:39,390 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:39,391 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n", - "2021-12-03 09:19:50,607 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-200.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model\n", - "2021-12-03 09:19:51,180 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-200.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:51,182 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:51,183 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-200.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model\n", - "2021-12-03 09:19:53,192 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-300.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model\n", - "2021-12-03 09:19:53,757 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-300.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:53,759 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:53,759 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-300.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model\n", - "2021-12-03 09:19:55,731 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-50.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model\n", - "2021-12-03 09:19:56,243 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-50.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model.vectors.npy with mmap=r\n", - "2021-12-03 09:19:56,245 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:19:56,246 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-50.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model\n", - "2021-12-03 09:20:00,290 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__paragram-25.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model\n", - "2021-12-03 09:20:02,504 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:02,505 : MainThread : INFO : loaded 
/home/oborchers/.cache/huggingface/hub/fse__paragram-25.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model\n", - "2021-12-03 09:20:04,476 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__paragram-300-sl999.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model\n", - "2021-12-03 09:20:06,554 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__paragram-300-sl999.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.vectors.npy with mmap=r\n", - "2021-12-03 09:20:06,556 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:06,557 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__paragram-300-sl999.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model\n", - "2021-12-03 09:20:08,524 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__paragram-300-ws353.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model\n", - "2021-12-03 09:20:12,518 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__paragram-300-ws353.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.vectors.npy with mmap=r\n", - "2021-12-03 09:20:12,520 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:12,520 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__paragram-300-ws353.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model\n", - "2021-12-03 09:20:15,531 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__paranmt-300.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model\n", - "2021-12-03 09:20:15,736 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__paranmt-300.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.vectors.npy with mmap=r\n", - "2021-12-03 09:20:15,738 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:15,739 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__paranmt-300.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model\n", - "2021-12-03 09:20:30,789 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__word2vec-google-news-300.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model\n", - "2021-12-03 09:20:37,419 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__word2vec-google-news-300.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.vectors.npy with mmap=r\n", - "2021-12-03 09:20:37,421 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:37,422 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__word2vec-google-news-300.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model\n", - "2021-12-03 09:20:42,324 : MainThread : INFO : loading FTVectors object from /home/oborchers/.cache/huggingface/hub/fse__fasttext-crawl-subwords-300.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model\n", - "2021-12-03 09:20:48,571 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__fasttext-crawl-subwords-300.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors.npy with mmap=r\n", - "2021-12-03 09:20:48,573 : MainThread : INFO : loading 
vectors_vocab from /home/oborchers/.cache/huggingface/hub/fse__fasttext-crawl-subwords-300.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_vocab.npy with mmap=r\n", - "2021-12-03 09:20:48,574 : MainThread : INFO : loading vectors_ngrams from /home/oborchers/.cache/huggingface/hub/fse__fasttext-crawl-subwords-300.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_ngrams.npy with mmap=r\n", - "2021-12-03 09:20:48,576 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-03 09:20:48,576 : MainThread : INFO : setting ignored attribute vectors_vocab_norm to None\n", - "2021-12-03 09:20:48,577 : MainThread : INFO : setting ignored attribute vectors_ngrams_norm to None\n", - "2021-12-03 09:20:48,578 : MainThread : INFO : setting ignored attribute buckets_word to None\n", - "2021-12-03 09:20:48,578 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__fasttext-crawl-subwords-300.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model\n" + "2022-04-10 20:57:47,250 : MainThread : INFO : Lock 23217094427696 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "864c7c7069a9464d9639c232d5800b82", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:47,677 : MainThread : INFO : Lock 23217094427696 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:48,088 : MainThread : INFO : Lock 23217094427936 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "470926f764aa4dffbdb3bd6c157f175f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=224.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:48,523 : MainThread : INFO : Lock 23217094427936 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:48,935 : MainThread : INFO : Lock 23217094427264 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7c176bf19b684adcb8230ae8a4414c49", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + 
"HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=54600933.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:51,756 : MainThread : INFO : Lock 23217094427264 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:57:52,156 : MainThread : INFO : Lock 23217094556688 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "56488deaaedf469781d620ffb3379584", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1199998928.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:46,239 : MainThread : INFO : Lock 23217094556688 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.vectors.npy.lock\n", + "2022-04-10 20:58:46,240 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:48,458 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model.vectors.npy with mmap=r\n", + "2022-04-10 20:58:48,460 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 20:58:54,473 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--fasttext-wiki-news-subwords-300.main.ef21870476cd93435d83140fcf6e7171b517e337/fasttext-wiki-news-subwords-300.model', 'datetime': '2022-04-10T20:58:54.447613', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 20:58:55,333 : MainThread : INFO : Lock 23216641444928 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2141c8c7b70747ae884bd66dc44a5d94", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:55,799 : MainThread : INFO : Lock 23216641444928 released on 
/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:56,225 : MainThread : INFO : Lock 23216641442000 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8bcd5651fad0427f85a76f84d1bfdd6e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:56,653 : MainThread : INFO : Lock 23216641442000 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:58:57,049 : MainThread : INFO : Lock 23216641444016 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ff826923d17d4ae7bbc6da0e0d2214fb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=68268001.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:00,003 : MainThread : INFO : Lock 23216641444016 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:00,416 : MainThread : INFO : Lock 23216641443584 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a8ab223bffd4c65b5fa16ad3658d177", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=477405728.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:14,692 : MainThread : INFO : Lock 23216641443584 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.vectors.npy.lock\n", + "2022-04-10 20:59:14,693 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 
20:59:17,633 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model.vectors.npy with mmap=r\n", + "2022-04-10 20:59:17,634 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 20:59:25,072 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-100.main.7ef1c21d9bf90598a0c618c041d9817e50250183/glove-twitter-100.model', 'datetime': '2022-04-10T20:59:25.072106', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 20:59:25,890 : MainThread : INFO : Lock 23216623418336 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9d9ab714b3a945b0828709f335467fe6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:26,350 : MainThread : INFO : Lock 23216623418336 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:26,761 : MainThread : INFO : Lock 23216623415408 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6af903499d37402bba1c88eeac2c6921", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:27,193 : MainThread : INFO : Lock 23216623415408 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:27,604 : MainThread : INFO : Lock 23216623416752 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3c5eacd4a7a24bb1968eb95004ea64e9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=68268001.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:30,800 : MainThread : INFO : Lock 23216623416752 released on 
/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 20:59:31,213 : MainThread : INFO : Lock 23216472298736 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "13730b1c284c4a1fa20c8ab67e5df528", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=954811328.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:04,121 : MainThread : INFO : Lock 23216472298736 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.vectors.npy.lock\n", + "2022-04-10 21:00:04,123 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:06,943 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model.vectors.npy with mmap=r\n", + "2022-04-10 21:00:06,945 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:00:14,454 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-200.main.72f480c107aaa58b9474ddaf45d13db2e34fa166/glove-twitter-200.model', 'datetime': '2022-04-10T21:00:14.454541', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:00:15,279 : MainThread : INFO : Lock 23214945516512 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "051219f84cdd4a93a7e98bae44cf0a89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:15,756 : MainThread : INFO : Lock 23214945516512 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:16,167 : MainThread : INFO : Lock 23216668016496 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/README.md.lock\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "2381e26d86154394b1d7e6bc4f912128", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:16,600 : MainThread : INFO : Lock 23216668016496 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:17,005 : MainThread : INFO : Lock 23216668014432 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "127c6fbcfcf540f2a8882a36228b5971", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=68268001.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:20,281 : MainThread : INFO : Lock 23216668014432 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:20,688 : MainThread : INFO : Lock 23216664898048 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c0ce539dc8ab4e48b05aedc3a240d9df", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=119351528.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:24,281 : MainThread : INFO : Lock 23216664898048 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.vectors.npy.lock\n", + "2022-04-10 21:00:24,282 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:27,355 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model.vectors.npy with mmap=r\n", + "2022-04-10 21:00:27,357 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:00:34,868 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-25.main.5ec1e20fb42502d60c4676070bad354ec71aa9aa/glove-twitter-25.model', 
'datetime': '2022-04-10T21:00:34.868101', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:00:35,704 : MainThread : INFO : Lock 23214286037584 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9620a729947742c4b9fef0afb5ad25a5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:36,160 : MainThread : INFO : Lock 23214286037584 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:36,561 : MainThread : INFO : Lock 23214286039216 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "58b97191c1d045ed9baf7648d0acc35b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:37,020 : MainThread : INFO : Lock 23214286039216 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:37,422 : MainThread : INFO : Lock 23214286039552 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "199dea86e81b490bb92b249d17010a32", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=68268001.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:40,656 : MainThread : INFO : Lock 23214286039552 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:41,055 : MainThread : INFO : Lock 23216412122656 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "438058ba15654f44a14f1792a3b4e699", + 
"version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=238702928.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:52,641 : MainThread : INFO : Lock 23216412122656 released on /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.vectors.npy.lock\n", + "2022-04-10 21:00:52,643 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:00:55,986 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model.vectors.npy with mmap=r\n", + "2022-04-10 21:00:55,987 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:01:03,683 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-twitter-50.main.38339c079845641fe59690e5a147fab348a2eb29/glove-twitter-50.model', 'datetime': '2022-04-10T21:01:03.683141', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:01:05,705 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n", + "2022-04-10 21:01:06,823 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy with mmap=r\n", + "2022-04-10 21:01:06,825 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:01:09,401 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model', 'datetime': '2022-04-10T21:01:09.401208', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:01:10,237 : MainThread : INFO : Lock 23216694088704 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f7846e2c954f4fbdbe84569d39544ef1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:10,679 : MainThread : INFO : Lock 23216694088704 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:11,078 : MainThread : INFO : Lock 23216694091584 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "70b258cafd0745b7aba39bec655c2f80", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:11,519 : MainThread : INFO : Lock 23216694091584 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:11,935 : MainThread : INFO : Lock 23216412120064 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bcb6c436e9ac41cfabe267b76def66a8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=21494764.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:13,663 : MainThread : INFO : Lock 23216412120064 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:14,063 : MainThread : INFO : Lock 23217094556640 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "95b0b19916474eaebf8518ba3e0299f0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=320000128.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:24,311 : MainThread : INFO : Lock 23217094556640 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.vectors.npy.lock\n", + "2022-04-10 21:01:24,312 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:25,958 : MainThread : INFO : loading vectors from 
/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model.vectors.npy with mmap=r\n", + "2022-04-10 21:01:25,960 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:01:28,568 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-200.main.96a689f1f194ddd2615e41c852396c1fb50e5882/glove-wiki-gigaword-200.model', 'datetime': '2022-04-10T21:01:28.568647', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:01:29,401 : MainThread : INFO : Lock 23214988651728 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0b948b8e85a448bca760194139512437", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:29,852 : MainThread : INFO : Lock 23214988651728 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:30,258 : MainThread : INFO : Lock 23214988650576 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1a111de0ebfa42e6a4e5fecd9431e092", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:30,719 : MainThread : INFO : Lock 23214988650576 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:31,125 : MainThread : INFO : Lock 23214988649568 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1bb084c05dc3426387f93de876b95b69", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=21494765.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:32,894 : MainThread : INFO : Lock 23214988649568 released on 
/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:33,307 : MainThread : INFO : Lock 23214988649760 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "192421f5c0694dbc88d2469fa14f69b9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=480000128.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:44,154 : MainThread : INFO : Lock 23214988649760 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.vectors.npy.lock\n", + "2022-04-10 21:01:44,155 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:45,835 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model.vectors.npy with mmap=r\n", + "2022-04-10 21:01:45,836 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:01:48,397 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-300.main.242f9d6f62200e8b1a2aedfc22e4d673c0549add/glove-wiki-gigaword-300.model', 'datetime': '2022-04-10T21:01:48.397775', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:01:50,442 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-50.main.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model\n", + "2022-04-10 21:01:52,399 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-50.main.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model.vectors.npy with mmap=r\n", + "2022-04-10 21:01:52,401 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:01:54,992 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-50.main.d2d3bc131d1c28de59b055d6724c742bda902bcf/glove-wiki-gigaword-50.model', 'datetime': '2022-04-10T21:01:54.992864', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:01:55,801 : MainThread : INFO : Lock 23216929418352 acquired on 
/home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c67ee87894cc4af4a73de5456d6c17a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:56,256 : MainThread : INFO : Lock 23216929418352 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:56,669 : MainThread : INFO : Lock 23216664896272 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/.ipynb_checkpoints/README-checkpoint.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1260e3e9247b4206bac08857607069a3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:57,105 : MainThread : INFO : Lock 23216664896272 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/.ipynb_checkpoints/README-checkpoint.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:57,501 : MainThread : INFO : Lock 23216664898384 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f372e180de6841a4acf8c1089609be1e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:57,949 : MainThread : INFO : Lock 23216664898384 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:01:58,357 : MainThread : INFO : Lock 23216664899296 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cc974ec8e9c44265b067cf4372f3aae1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=19962648.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:00,634 
: MainThread : INFO : Lock 23216664899296 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model.lock\n", + "2022-04-10 21:02:00,636 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model\n", + "2022-04-10 21:02:01,210 : MainThread : INFO : setting ignored attribute vectors_norm to None\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:01,849 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--paragram-25.main.d27454408fa98c7bf128e58602f66775d23d532c/paragram-25.model', 'datetime': '2022-04-10T21:02:01.849070', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:02:02,660 : MainThread : INFO : Lock 23216850738144 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a6dffddeb87f41f0ae71a40e527515f7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:03,115 : MainThread : INFO : Lock 23216850738144 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:03,523 : MainThread : INFO : Lock 23216850738912 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4952f3a412de4b13bf730a37acdf1f9d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=172.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:03,953 : MainThread : INFO : Lock 23216850738912 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:04,349 : MainThread : INFO : Lock 23216850737424 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3beec26dff2144d8b432c04796c16f3a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=88857573.0), HTML(value='')))" + ] 
+ }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:08,591 : MainThread : INFO : Lock 23216850737424 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:09,005 : MainThread : INFO : Lock 23216947652304 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c6e0794106b94c2ba9d7d8c6aa3be24c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2044507328.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:02:57,092 : MainThread : INFO : Lock 23216947652304 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.vectors.npy.lock\n", + "2022-04-10 21:02:57,094 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:02,493 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model.vectors.npy with mmap=r\n", + "2022-04-10 21:03:02,496 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:03:13,183 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--paragram-300-sl999.main.d16350f324c00fadb0f7ed05f8a9df130d950aab/paragram-300-sl999.model', 'datetime': '2022-04-10T21:03:13.183333', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:03:14,051 : MainThread : INFO : Lock 23214977322000 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1379accab8954d369649b99363fa9596", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:14,525 : MainThread : INFO : Lock 23214977322000 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:14,935 : 
MainThread : INFO : Lock 23214977320608 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fba67c5b69e24bd081f16afda36ac537", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=173.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:15,367 : MainThread : INFO : Lock 23214977320608 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:15,771 : MainThread : INFO : Lock 23214977321376 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "00f958d5d4114a8e8ff7e949dd326fa1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=88857573.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:19,348 : MainThread : INFO : Lock 23214977321376 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:03:19,763 : MainThread : INFO : Lock 23216828244176 acquired on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0a1a18fe900e4ab4aa2899c1ba0431d3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2044507328.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:38,000 : MainThread : INFO : Lock 23216828244176 released on /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.vectors.npy.lock\n", + "2022-04-10 21:04:38,002 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:43,718 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model.vectors.npy with mmap=r\n", + "2022-04-10 21:04:43,720 : MainThread : INFO : setting ignored attribute vectors_norm to 
None\n", + "2022-04-10 21:04:54,408 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--paragram-300-ws353.main.cda1e084a44a24a769d595e5b6caf7ee7a8500b5/paragram-300-ws353.model', 'datetime': '2022-04-10T21:04:54.408109', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:04:55,242 : MainThread : INFO : Lock 23210948017504 acquired on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df098de4522c4305b6ec7ea1f59e3531", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:55,735 : MainThread : INFO : Lock 23210948017504 released on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:56,133 : MainThread : INFO : Lock 23216947650816 acquired on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bc9f230c78c44a78b8c7c0976c615f0d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=278.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:56,573 : MainThread : INFO : Lock 23216947650816 released on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:56,987 : MainThread : INFO : Lock 23216947652304 acquired on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d7ef5c57b5848c59b9ab1ba4bf6f90e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=3836889.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:58,204 : MainThread : INFO : Lock 23216947652304 released on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:04:58,601 : MainThread : INFO : Lock 23216517341584 acquired on 
/home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0bb3625696f34b48bfafd1a4f0797441", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=92668928.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:02,117 : MainThread : INFO : Lock 23216517341584 released on /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.vectors.npy.lock\n", + "2022-04-10 21:05:02,119 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model\n", + "2022-04-10 21:05:02,277 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model.vectors.npy with mmap=r\n", + "2022-04-10 21:05:02,279 : MainThread : INFO : setting ignored attribute vectors_norm to None\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:02,744 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--paranmt-300.main.ae7612b27a2516b44a42bbc148f9936332d30847/paranmt-300.model', 'datetime': '2022-04-10T21:05:02.744504', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:05:03,577 : MainThread : INFO : Lock 23208651680784 acquired on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c43bf4ddcdd943b593041c3ce0e8d096", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:04,013 : MainThread : INFO : Lock 23208651680784 released on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:04,413 : MainThread : INFO : Lock 23208651681552 acquired on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2a4a97cc6f7948dba4eaa393beaae31f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=688.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:04,859 : MainThread : 
INFO : Lock 23208651681552 released on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:05,272 : MainThread : INFO : Lock 23216466616128 acquired on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ee8e57fd3ee54340866dd53897198631", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=182007201.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:10,440 : MainThread : INFO : Lock 23216466616128 released on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:05:10,839 : MainThread : INFO : Lock 23216828246816 acquired on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "105d0ca31c17418ebb2276b00c0445ba", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=3600000128.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:06:32,703 : MainThread : INFO : Lock 23216828246816 released on /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.vectors.npy.lock\n", + "2022-04-10 21:06:32,705 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:06:42,794 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model.vectors.npy with mmap=r\n", + "2022-04-10 21:06:42,795 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:07:01,547 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--word2vec-google-news-300.main.528f381952a0b7d777bb4a611c4a43f588d48994/word2vec-google-news-300.model', 'datetime': '2022-04-10T21:07:01.547760', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n", + "2022-04-10 21:07:02,386 : MainThread : INFO : Lock 23208726395920 acquired on 
/home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/.gitattributes.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ce17c59a7e0349799ff241dcda342353", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1261.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:02,918 : MainThread : INFO : Lock 23208726395920 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/.gitattributes.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:03,315 : MainThread : INFO : Lock 23208726394096 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/README.md.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3afbbee431ca4fa88ba798394f6d9b8b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=199.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:03,742 : MainThread : INFO : Lock 23208726394096 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/README.md.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:04,156 : MainThread : INFO : Lock 23216517344896 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "662b5a902e00455caa6d0533d7a4253f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=123932951.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:07,990 : MainThread : INFO : Lock 23216517344896 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:07:08,405 : MainThread : INFO : Lock 23216517343600 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d49f2e2c0dbe4c079fedab7ba0a358c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2400000128.0), HTML(value='')))" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:08:24,510 : MainThread : INFO : Lock 23216517343600 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors.npy.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:08:24,947 : MainThread : INFO : Lock 23217094152880 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_ngrams.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ad34fdb6061343dba27012fb17c214ae", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2400000128.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:09:57,606 : MainThread : INFO : Lock 23217094152880 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_ngrams.npy.lock\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:09:58,069 : MainThread : INFO : Lock 23216517341584 acquired on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_vocab.npy.lock\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "73a2de2b7c3641fa9a81a1db1d59588e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2400000128.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:11:22,020 : MainThread : INFO : Lock 23216517341584 released on /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_vocab.npy.lock\n", + "2022-04-10 21:11:22,022 : MainThread : INFO : loading FTVectors object from /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-10 21:11:29,456 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors.npy with mmap=r\n", + "2022-04-10 21:11:29,457 : MainThread : INFO : loading vectors_vocab from /home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_vocab.npy with mmap=r\n", + "2022-04-10 21:11:29,459 : MainThread : INFO : loading vectors_ngrams from 
/home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model.vectors_ngrams.npy with mmap=r\n", + "2022-04-10 21:11:29,460 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:11:29,461 : MainThread : INFO : setting ignored attribute vectors_vocab_norm to None\n", + "2022-04-10 21:11:29,461 : MainThread : INFO : setting ignored attribute vectors_ngrams_norm to None\n", + "2022-04-10 21:11:29,462 : MainThread : INFO : setting ignored attribute buckets_word to None\n", + "2022-04-10 21:11:55,060 : MainThread : INFO : FastTextKeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--fasttext-crawl-subwords-300.main.5db65694a7b3fde5a4f1a4c72ce96a25b931692d/fasttext-crawl-subwords-300.model', 'datetime': '2022-04-10T21:11:55.060479', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n" ] } ], @@ -311,7 +2201,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -340,7 +2230,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -355,22 +2245,22 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:38:51,576 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:38:52,415 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:38:52,773 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:38:53,082 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", - "2021-12-03 09:38:53,084 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:38:53,130 : MainThread : INFO : begin training\n", - "2021-12-03 09:38:53,488 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:38:53,489 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 7686 sentences/s\n", - "2021-12-03 09:38:53,532 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:11:55,679 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", + "2022-04-10 21:11:58,635 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:11:58,960 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:11:59,761 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", + "2022-04-10 21:11:59,762 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:11:59,799 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:00,652 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:00,653 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 3230 
sentences/s\n", + "2022-04-10 21:12:00,692 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -384,14 +2274,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:38:54,580 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:38:54,910 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:38:55,347 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", - "2021-12-03 09:38:55,348 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:38:55,381 : MainThread : INFO : begin training\n", - "2021-12-03 09:38:55,791 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:38:55,792 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6698 sentences/s\n", - "2021-12-03 09:38:55,822 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:04,126 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:04,442 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:05,502 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", + "2022-04-10 21:12:05,504 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:05,542 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:05,995 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:05,996 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6079 sentences/s\n", + "2022-04-10 21:12:06,025 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -405,14 +2295,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:38:56,793 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:38:57,153 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:38:57,578 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", - "2021-12-03 09:38:57,580 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:38:57,629 : MainThread : INFO : begin training\n", - "2021-12-03 09:38:57,973 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:38:57,974 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 8008 sentences/s\n", - "2021-12-03 09:38:58,003 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:09,554 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:09,878 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:10,907 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", + 
"2022-04-10 21:12:10,909 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:10,947 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:11,401 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:11,403 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6055 sentences/s\n", + "2022-04-10 21:12:11,434 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -426,14 +2316,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:38:59,042 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:38:59,352 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:38:59,807 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", - "2021-12-03 09:38:59,808 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:38:59,841 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:00,227 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:00,228 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7124 sentences/s\n", - "2021-12-03 09:39:00,258 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:14,870 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:15,278 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:16,312 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", + "2022-04-10 21:12:16,314 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:16,349 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:16,785 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:16,786 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6306 sentences/s\n", + "2022-04-10 21:12:16,816 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -447,14 +2337,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:01,259 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:01,658 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:02,092 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", - "2021-12-03 09:39:02,094 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:02,127 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:02,518 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:02,520 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7027 sentences/s\n", - "2021-12-03 09:39:02,546 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for 
language: en\n" + "2022-04-10 21:12:20,380 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:20,711 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:21,727 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", + "2022-04-10 21:12:21,728 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:21,766 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:22,207 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:22,208 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6236 sentences/s\n", + "2022-04-10 21:12:22,239 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -468,14 +2358,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:02,920 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:03,234 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:03,391 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", - "2021-12-03 09:39:03,392 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:03,414 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:03,832 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:03,833 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6572 sentences/s\n", - "2021-12-03 09:39:03,862 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:23,433 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:23,770 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:24,111 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", + "2022-04-10 21:12:24,112 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:24,132 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:24,583 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:24,584 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6097 sentences/s\n", + "2022-04-10 21:12:24,613 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -489,14 +2379,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:04,230 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:04,545 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:04,680 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", - "2021-12-03 09:39:04,681 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:04,703 : MainThread : 
INFO : begin training\n", - "2021-12-03 09:39:05,056 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:05,057 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7776 sentences/s\n", - "2021-12-03 09:39:05,087 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:25,798 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:26,196 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:26,554 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", + "2022-04-10 21:12:26,556 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:26,583 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:26,953 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:26,954 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7427 sentences/s\n", + "2022-04-10 21:12:26,986 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -510,14 +2400,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:05,468 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:05,784 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:05,909 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", - "2021-12-03 09:39:05,910 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:05,933 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:06,292 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:06,293 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7655 sentences/s\n", - "2021-12-03 09:39:06,328 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:28,279 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:28,704 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:29,062 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", + "2022-04-10 21:12:29,063 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:29,084 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:29,535 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:29,536 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6099 sentences/s\n", + "2022-04-10 21:12:29,570 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -531,15 +2421,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:06,719 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:07,042 
: MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:07,179 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", - "2021-12-03 09:39:07,180 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:07,202 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:07,630 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:07,630 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6433 sentences/s\n", - "2021-12-03 09:39:07,654 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:39:07,780 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:12:30,841 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:31,168 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:31,534 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", + "2022-04-10 21:12:31,535 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:31,561 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:32,037 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:32,039 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 5781 sentences/s\n", + "2022-04-10 21:12:32,080 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -553,13 +2442,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:08,184 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:08,236 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", - "2021-12-03 09:39:08,237 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:08,249 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:08,613 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:08,614 : MainThread : INFO : training on 2758 effective sentences with 27038 effective words took 0s with 7547 sentences/s\n", - "2021-12-03 09:39:08,641 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:32,465 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:32,825 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:32,912 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", + "2022-04-10 21:12:32,914 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:32,927 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:33,386 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:33,387 : MainThread : INFO : training on 2758 effective sentences with 27038 
effective words took 0s with 5991 sentences/s\n", + "2022-04-10 21:12:33,419 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -573,14 +2463,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:09,972 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:10,335 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:10,821 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:39:10,822 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:10,864 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:11,282 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:11,283 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6586 sentences/s\n", - "2021-12-03 09:39:11,313 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:38,348 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:38,680 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:40,139 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:12:40,140 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:40,185 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:40,527 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:40,528 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 8052 sentences/s\n", + "2022-04-10 21:12:40,564 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -594,15 +2484,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:12,556 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:12,910 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:13,457 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:39:13,459 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:13,500 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:13,887 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:13,888 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 7107 sentences/s\n", - "2021-12-03 09:39:13,917 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:39:13,997 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:12:45,392 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:45,808 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + 
"2022-04-10 21:12:47,366 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:12:47,368 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:47,429 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:47,816 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:47,817 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 7100 sentences/s\n", + "2022-04-10 21:12:47,853 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -616,13 +2505,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:14,410 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:14,450 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", - "2021-12-03 09:39:14,451 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:14,471 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:14,822 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:14,823 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 7832 sentences/s\n", - "2021-12-03 09:39:14,855 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:48,135 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:48,482 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:12:48,548 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", + "2022-04-10 21:12:48,550 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:12:48,562 : MainThread : INFO : begin training\n", + "2022-04-10 21:12:49,009 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:12:49,010 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 6163 sentences/s\n", + "2022-04-10 21:12:49,047 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -636,14 +2526,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:17,083 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:17,447 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:18,570 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)\n", - "2021-12-03 09:39:18,571 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:18,634 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:18,983 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:18,984 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 7881 sentences/s\n", - "2021-12-03 09:39:19,014 : MainThread : INFO : no frequency mode: using 
wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:12:57,961 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:12:58,290 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:00,987 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)\n", + "2022-04-10 21:13:00,988 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:01,078 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:01,522 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:01,523 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 6202 sentences/s\n", + "2022-04-10 21:13:01,562 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -657,14 +2547,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:20,654 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:20,954 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:21,701 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", - "2021-12-03 09:39:21,703 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:21,749 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:22,173 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:22,174 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 6483 sentences/s\n", - "2021-12-03 09:39:22,204 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:07,465 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:07,807 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:09,600 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", + "2022-04-10 21:13:09,601 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:09,694 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:10,149 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:10,150 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 6044 sentences/s\n", + "2022-04-10 21:13:10,185 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { @@ -678,719 +2568,719 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:22,922 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:23,239 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:23,542 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", - "2021-12-03 09:39:23,544 : MainThread : INFO : initializing sentence vectors for 2758 
sentences\n", - "2021-12-03 09:39:23,556 : MainThread : INFO : pre-computing SIF weights for 999999 words\n", - "2021-12-03 09:39:24,217 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:24,635 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:24,762 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:24,770 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:24,772 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 6581 sentences/s\n", - "2021-12-03 09:39:24,879 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:13,168 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:13,491 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:14,321 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", + "2022-04-10 21:13:14,322 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:14,335 : MainThread : INFO : pre-computing SIF weights for 999999 words\n", + "2022-04-10 21:13:15,743 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:16,200 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:16,271 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:16,281 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:16,283 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 6019 sentences/s\n", + "2022-04-10 21:13:16,337 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 72.24\n" + "SIF-10 components 72.29\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:25,851 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:26,401 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:26,839 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", - "2021-12-03 09:39:26,840 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:26,857 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", - "2021-12-03 09:39:27,748 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:28,085 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:28,469 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:28,472 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:28,473 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 8163 sentences/s\n", - "2021-12-03 09:39:28,520 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:19,886 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:20,221 : MainThread : 
INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:21,296 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", + "2022-04-10 21:13:21,298 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:21,308 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", + "2022-04-10 21:13:22,885 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:23,321 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:23,376 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:23,382 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:23,383 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6303 sentences/s\n", + "2022-04-10 21:13:23,453 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 69.70\n" + "SIF-10 components 69.65\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:29,506 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:29,820 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:30,251 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", - "2021-12-03 09:39:30,252 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:30,275 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", - "2021-12-03 09:39:31,141 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:31,528 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:32,031 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:32,040 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:32,042 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7103 sentences/s\n", - "2021-12-03 09:39:32,152 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:27,048 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:27,374 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:28,484 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", + "2022-04-10 21:13:28,486 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:28,500 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", + "2022-04-10 21:13:30,148 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:30,587 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:30,645 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:30,652 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:30,653 : MainThread : INFO : training on 2758 effective sentences with 26828 
effective words took 0s with 6265 sentences/s\n", + "2022-04-10 21:13:30,749 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 71.67\n" + "SIF-10 components 71.62\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:33,069 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:33,387 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:33,821 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", - "2021-12-03 09:39:33,823 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:33,832 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", - "2021-12-03 09:39:34,741 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:35,127 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:35,330 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:35,333 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:35,335 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7119 sentences/s\n", - "2021-12-03 09:39:35,363 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:34,166 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:34,551 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:35,585 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", + "2022-04-10 21:13:35,586 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:35,598 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", + "2022-04-10 21:13:37,202 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:37,670 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:37,702 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:37,704 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:37,705 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 5870 sentences/s\n", + "2022-04-10 21:13:37,771 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 54.42\n" + "SIF-10 components 54.16\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:36,345 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:36,660 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:37,123 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", - "2021-12-03 09:39:37,124 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:37,139 : MainThread : INFO : 
pre-computing SIF weights for 1193514 words\n", - "2021-12-03 09:39:38,051 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:38,394 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:38,772 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:38,776 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:38,776 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7998 sentences/s\n", - "2021-12-03 09:39:38,849 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:41,419 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:41,858 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:42,942 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", + "2022-04-10 21:13:42,943 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:42,957 : MainThread : INFO : pre-computing SIF weights for 1193514 words\n", + "2022-04-10 21:13:44,559 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:44,993 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:45,032 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:45,035 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:45,036 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6326 sentences/s\n", + "2022-04-10 21:13:45,106 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 65.57\n" + "SIF-10 components 65.52\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:39,230 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:39,540 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:39,692 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", - "2021-12-03 09:39:39,694 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:39,705 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", - "2021-12-03 09:39:40,018 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:40,360 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:40,572 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:40,576 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:40,576 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 8032 sentences/s\n", - "2021-12-03 09:39:40,668 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:46,444 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:46,859 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 
and 27528 total words\n", + "2022-04-10 21:13:47,256 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", + "2022-04-10 21:13:47,257 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:47,275 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", + "2022-04-10 21:13:47,865 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:48,261 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:48,301 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:48,304 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:48,306 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6945 sentences/s\n", + "2022-04-10 21:13:48,369 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 68.43\n" + "SIF-10 components 68.34\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:41,034 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:41,353 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:41,504 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", - "2021-12-03 09:39:41,505 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:41,524 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", - "2021-12-03 09:39:41,842 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:42,198 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:42,354 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:42,360 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:42,361 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7731 sentences/s\n", - "2021-12-03 09:39:42,406 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:49,669 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:50,006 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:50,414 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", + "2022-04-10 21:13:50,415 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:50,429 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", + "2022-04-10 21:13:50,979 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:51,412 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:51,470 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:51,475 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:51,476 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6349 sentences/s\n", + "2022-04-10 
21:13:51,514 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 70.73\n" + "SIF-10 components 70.62\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:42,828 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:43,151 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:43,294 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", - "2021-12-03 09:39:43,296 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:43,310 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", - "2021-12-03 09:39:43,579 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:43,928 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:44,268 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:44,274 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:44,276 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7869 sentences/s\n", - "2021-12-03 09:39:44,361 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:52,767 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:53,132 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:53,483 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", + "2022-04-10 21:13:53,484 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:53,497 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", + "2022-04-10 21:13:54,037 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:54,464 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:54,522 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:54,528 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:54,530 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6437 sentences/s\n", + "2022-04-10 21:13:54,627 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 71.43\n" + "SIF-10 components 71.35\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:44,750 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:45,062 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:45,198 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", - "2021-12-03 09:39:45,200 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:45,215 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", - "2021-12-03 09:39:45,479 : 
MainThread : INFO : begin training\n", - "2021-12-03 09:39:45,828 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:45,872 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:45,876 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:45,877 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7873 sentences/s\n", - "2021-12-03 09:39:45,927 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:39:46,055 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:13:55,809 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:56,212 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:56,551 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", + "2022-04-10 21:13:56,552 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:56,566 : MainThread : INFO : pre-computing SIF weights for 400000 words\n", + "2022-04-10 21:13:57,101 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:57,539 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:57,585 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:57,590 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:57,591 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6273 sentences/s\n", + "2022-04-10 21:13:57,653 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 64.20\n" + "SIF-10 components 64.11\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:46,365 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:46,407 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", - "2021-12-03 09:39:46,409 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:46,427 : MainThread : INFO : pre-computing SIF weights for 99958 words\n", - "2021-12-03 09:39:46,504 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:46,848 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:46,879 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:46,882 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:46,883 : MainThread : INFO : training on 2758 effective sentences with 27038 effective words took 0s with 8005 sentences/s\n", - "2021-12-03 09:39:46,936 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:13:58,056 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:13:58,385 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:13:58,483 : MainThread : INFO : 
estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", + "2022-04-10 21:13:58,484 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:13:58,495 : MainThread : INFO : pre-computing SIF weights for 99958 words\n", + "2022-04-10 21:13:58,658 : MainThread : INFO : begin training\n", + "2022-04-10 21:13:59,121 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:13:59,173 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:13:59,179 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:13:59,180 : MainThread : INFO : training on 2758 effective sentences with 27038 effective words took 0s with 5945 sentences/s\n", + "2022-04-10 21:13:59,283 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 59.22\n" + "SIF-10 components 59.07\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:48,304 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:48,677 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:49,160 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:39:49,161 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:49,174 : MainThread : INFO : pre-computing SIF weights for 1703756 words\n", - "2021-12-03 09:39:50,270 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:50,723 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:50,788 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:50,795 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:50,797 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6082 sentences/s\n", - "2021-12-03 09:39:50,884 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:04,259 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:04,598 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:06,100 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:14:06,101 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:06,114 : MainThread : INFO : pre-computing SIF weights for 1703756 words\n", + "2022-04-10 21:14:08,290 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:08,733 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:08,787 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:14:08,792 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:14:08,793 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6210 sentences/s\n", + "2022-04-10 21:14:08,842 : MainThread : INFO : no frequency mode: using wordfreq for estimation 
of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 74.27\n" + "SIF-10 components 74.21\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:52,148 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:52,458 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:53,007 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:39:53,009 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:53,021 : MainThread : INFO : pre-computing SIF weights for 1703756 words\n", - "2021-12-03 09:39:54,176 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:54,630 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:54,710 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:54,717 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:54,718 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6055 sentences/s\n", - "2021-12-03 09:39:54,804 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:39:54,889 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:14:13,811 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:14,145 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:15,566 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:14:15,568 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:15,590 : MainThread : INFO : pre-computing SIF weights for 1703756 words\n", + "2022-04-10 21:14:17,860 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:18,305 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:18,339 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:14:18,346 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:14:18,347 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6181 sentences/s\n", + "2022-04-10 21:14:18,439 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 74.08\n" + "SIF-10 components 74.03\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:55,207 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:55,245 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", - "2021-12-03 09:39:55,248 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:55,260 : MainThread : INFO : pre-computing SIF weights for 77224 words\n", - "2021-12-03 09:39:55,325 : MainThread : INFO : begin training\n", - "2021-12-03 09:39:55,674 : MainThread 
: INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:39:55,740 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:39:55,745 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:39:55,746 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 7859 sentences/s\n", - "2021-12-03 09:39:55,819 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:18,742 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:19,121 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:19,190 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", + "2022-04-10 21:14:19,191 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:19,202 : MainThread : INFO : pre-computing SIF weights for 77224 words\n", + "2022-04-10 21:14:19,305 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:19,761 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:19,807 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:14:19,812 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:14:19,813 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 6035 sentences/s\n", + "2022-04-10 21:14:19,908 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 76.76\n" + "SIF-10 components 76.72\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:39:57,877 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:39:58,183 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:39:59,233 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)\n", - "2021-12-03 09:39:59,235 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:39:59,245 : MainThread : INFO : pre-computing SIF weights for 3000000 words\n", - "2021-12-03 09:40:01,380 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:01,757 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:01,937 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:40:01,942 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:40:01,944 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 7303 sentences/s\n", - "2021-12-03 09:40:02,030 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:28,708 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:29,029 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:31,697 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 
3447 MB (3 GB)\n", + "2022-04-10 21:14:31,698 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:31,711 : MainThread : INFO : pre-computing SIF weights for 3000000 words\n", + "2022-04-10 21:14:35,675 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:36,114 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:36,165 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:14:36,169 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:14:36,170 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 6267 sentences/s\n", + "2022-04-10 21:14:36,275 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "SIF-10 components 71.17\n" + "SIF-10 components 71.12\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:03,585 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:03,896 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:04,677 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", - "2021-12-03 09:40:04,678 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:04,698 : MainThread : INFO : pre-computing SIF weights for 2000000 words\n", - "2021-12-03 09:40:06,200 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:06,542 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:06,883 : MainThread : INFO : computing 10 principal components took 0s\n", - "2021-12-03 09:40:06,887 : MainThread : INFO : removing 10 principal components took 0s\n", - "2021-12-03 09:40:06,888 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 8032 sentences/s\n", - "2021-12-03 09:40:06,975 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:42,225 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:42,659 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:44,479 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", + "2022-04-10 21:14:44,481 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:44,495 : MainThread : INFO : pre-computing SIF weights for 2000000 words\n", + "2022-04-10 21:14:47,243 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:47,689 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:47,750 : MainThread : INFO : computing 10 principal components took 0s\n", + "2022-04-10 21:14:47,758 : MainThread : INFO : removing 10 principal components took 0s\n", + "2022-04-10 21:14:47,761 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 6177 sentences/s\n", + "2022-04-10 21:14:47,871 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ - "SIF-10 components 73.54\n" + "SIF-10 components 73.38\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:07,766 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:08,092 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:08,387 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", - "2021-12-03 09:40:08,389 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:08,402 : MainThread : INFO : pre-computing uSIF weights for 999999 words\n", - "2021-12-03 09:40:10,809 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:11,187 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:11,240 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:11,243 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:11,245 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 7261 sentences/s\n", - "2021-12-03 09:40:11,350 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:50,778 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:51,104 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:14:51,910 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 999999 vocabulary: 1151 MB (1 GB)\n", + "2022-04-10 21:14:51,911 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:14:51,926 : MainThread : INFO : pre-computing uSIF weights for 999999 words\n", + "2022-04-10 21:14:55,212 : MainThread : INFO : begin training\n", + "2022-04-10 21:14:55,686 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:14:55,770 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:14:55,774 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:14:55,777 : MainThread : INFO : training on 2758 effective sentences with 27172 effective words took 0s with 5797 sentences/s\n", + "2022-04-10 21:14:55,826 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 69.05\n" + "uSIF length 68.63\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:12,385 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:12,706 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:13,137 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", - "2021-12-03 09:40:13,139 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:13,148 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", - "2021-12-03 09:40:16,097 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:16,501 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 
09:40:16,549 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:16,554 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:16,555 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6810 sentences/s\n", - "2021-12-03 09:40:16,654 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:14:59,355 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:14:59,700 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:00,811 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 1193514 vocabulary: 460 MB (0 GB)\n", + "2022-04-10 21:15:00,812 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:00,826 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", + "2022-04-10 21:15:04,828 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:05,214 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:05,267 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:05,274 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:05,276 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7126 sentences/s\n", + "2022-04-10 21:15:05,368 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 64.22\n" + "uSIF length 64.13\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:17,648 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:17,978 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:18,409 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", - "2021-12-03 09:40:18,411 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:18,435 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", - "2021-12-03 09:40:21,629 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:22,011 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:22,062 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:22,067 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:22,068 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 7190 sentences/s\n", - "2021-12-03 09:40:22,149 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:09,049 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:09,377 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:10,437 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 1193514 vocabulary: 917 MB (0 GB)\n", + "2022-04-10 21:15:10,439 : MainThread : INFO : initializing sentence vectors 
for 2758 sentences\n", + "2022-04-10 21:15:10,457 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", + "2022-04-10 21:15:14,583 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:15,070 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:15,118 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:15,122 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:15,123 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 5646 sentences/s\n", + "2022-04-10 21:15:15,198 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 66.73\n" + "uSIF length 66.67\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:23,160 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:23,575 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:24,036 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", - "2021-12-03 09:40:24,037 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:24,048 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", - "2021-12-03 09:40:27,007 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:27,347 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:27,370 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:27,372 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:27,373 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 8092 sentences/s\n", - "2021-12-03 09:40:27,422 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:18,756 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:19,148 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:20,235 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 1193514 vocabulary: 118 MB (0 GB)\n", + "2022-04-10 21:15:20,237 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:20,249 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", + "2022-04-10 21:15:24,258 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:24,723 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:24,774 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:24,779 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:24,780 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 5922 sentences/s\n", + "2022-04-10 21:15:24,868 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 55.17\n" + "uSIF length 55.06\n" ] }, { "name": "stderr", "output_type": "stream", 
"text": [ - "2021-12-03 09:40:28,435 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:28,742 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:29,182 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", - "2021-12-03 09:40:29,183 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:29,196 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", - "2021-12-03 09:40:32,178 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:32,630 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:32,693 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:32,699 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:32,700 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 6089 sentences/s\n", - "2021-12-03 09:40:32,784 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:28,479 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:28,842 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:29,904 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 1193514 vocabulary: 232 MB (0 GB)\n", + "2022-04-10 21:15:29,906 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:29,917 : MainThread : INFO : pre-computing uSIF weights for 1193514 words\n", + "2022-04-10 21:15:33,992 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:34,462 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:34,497 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:34,500 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:34,501 : MainThread : INFO : training on 2758 effective sentences with 26828 effective words took 0s with 5850 sentences/s\n", + "2022-04-10 21:15:34,563 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 60.50\n" + "uSIF length 60.41\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:33,188 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:33,490 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:33,650 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", - "2021-12-03 09:40:33,651 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:33,661 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", - "2021-12-03 09:40:34,654 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:35,036 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:35,082 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:35,089 : MainThread : INFO : 
removing 5 principal components took 0s\n", - "2021-12-03 09:40:35,090 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7197 sentences/s\n", - "2021-12-03 09:40:35,192 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:35,796 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:36,138 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:36,493 : MainThread : INFO : estimated memory for 2758 sentences with 100 dimensions and 400000 vocabulary: 155 MB (0 GB)\n", + "2022-04-10 21:15:36,495 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:36,505 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", + "2022-04-10 21:15:37,871 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:38,356 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:38,389 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:38,394 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:38,395 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 5679 sentences/s\n", + "2022-04-10 21:15:38,467 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 65.48\n" + "uSIF length 65.33\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:35,568 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:35,930 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:36,066 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", - "2021-12-03 09:40:36,067 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:36,082 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", - "2021-12-03 09:40:37,120 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:37,467 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:37,549 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:37,554 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:37,555 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7911 sentences/s\n", - "2021-12-03 09:40:37,602 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:39,727 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:40,069 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:40,440 : MainThread : INFO : estimated memory for 2758 sentences with 200 dimensions and 400000 vocabulary: 308 MB (0 GB)\n", + "2022-04-10 21:15:40,441 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:40,461 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", + "2022-04-10 
21:15:41,928 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:42,292 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:42,330 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:42,333 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:42,334 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7556 sentences/s\n", + "2022-04-10 21:15:42,395 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 67.26\n" + "uSIF length 67.11\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:37,995 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:38,304 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:38,431 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", - "2021-12-03 09:40:38,433 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:38,446 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", - "2021-12-03 09:40:39,394 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:39,784 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:39,904 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:39,911 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:39,913 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7049 sentences/s\n", - "2021-12-03 09:40:39,999 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:43,661 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:43,980 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:44,352 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 400000 vocabulary: 462 MB (0 GB)\n", + "2022-04-10 21:15:44,353 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:44,364 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", + "2022-04-10 21:15:45,705 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:46,146 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:46,178 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:46,185 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:46,186 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6242 sentences/s\n", + "2022-04-10 21:15:46,271 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 67.73\n" + "uSIF length 67.60\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:40,376 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 
09:40:40,707 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:40,848 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", - "2021-12-03 09:40:40,849 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:40,867 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", - "2021-12-03 09:40:41,835 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:42,215 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:42,359 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:42,363 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:42,364 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 7247 sentences/s\n", - "2021-12-03 09:40:42,391 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:40:42,515 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:15:47,539 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:47,892 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:48,235 : MainThread : INFO : estimated memory for 2758 sentences with 50 dimensions and 400000 vocabulary: 78 MB (0 GB)\n", + "2022-04-10 21:15:48,236 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:48,251 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", + "2022-04-10 21:15:49,671 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:50,120 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:50,158 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:50,162 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:50,163 : MainThread : INFO : training on 2758 effective sentences with 27410 effective words took 0s with 6123 sentences/s\n", + "2022-04-10 21:15:50,196 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 62.22\n" + "uSIF length 62.06\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:42,834 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:42,875 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", - "2021-12-03 09:40:42,876 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:42,889 : MainThread : INFO : pre-computing uSIF weights for 99958 words\n", - "2021-12-03 09:40:43,140 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:43,489 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:43,519 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:43,523 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:43,524 : MainThread : INFO : training on 2758 effective sentences with 27038 
effective words took 0s with 7871 sentences/s\n", - "2021-12-03 09:40:43,576 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:50,576 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:50,963 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:51,054 : MainThread : INFO : estimated memory for 2758 sentences with 25 dimensions and 99958 vocabulary: 10 MB (0 GB)\n", + "2022-04-10 21:15:51,055 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:51,067 : MainThread : INFO : pre-computing uSIF weights for 99958 words\n", + "2022-04-10 21:15:51,393 : MainThread : INFO : begin training\n", + "2022-04-10 21:15:51,823 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:15:51,843 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:15:51,845 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:15:51,846 : MainThread : INFO : training on 2758 effective sentences with 27038 effective words took 0s with 6399 sentences/s\n", + "2022-04-10 21:15:51,878 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 64.31\n" + "uSIF length 64.22\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:44,922 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:45,229 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:45,744 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:40:45,746 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:45,755 : MainThread : INFO : pre-computing uSIF weights for 1703756 words\n", - "2021-12-03 09:40:49,750 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:50,113 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:50,420 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:50,428 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:50,430 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 7576 sentences/s\n", - "2021-12-03 09:40:50,516 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:15:56,885 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:15:57,202 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:15:58,675 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:15:58,676 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:15:58,691 : MainThread : INFO : pre-computing uSIF weights for 1703756 words\n", + "2022-04-10 21:16:04,254 : MainThread : INFO : begin training\n", + "2022-04-10 21:16:04,722 : MainThread : INFO : worker thread finished; awaiting 
finish of 0 more threads\n", + "2022-04-10 21:16:04,789 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:16:04,798 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:16:04,799 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 5878 sentences/s\n", + "2022-04-10 21:16:04,858 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 73.09\n" + "uSIF length 73.04\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:51,760 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:40:52,075 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:40:52,626 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", - "2021-12-03 09:40:52,628 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:52,638 : MainThread : INFO : pre-computing uSIF weights for 1703756 words\n", - "2021-12-03 09:40:56,747 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:57,191 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:57,403 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:57,408 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:57,409 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6198 sentences/s\n", - "2021-12-03 09:40:57,476 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n", - "2021-12-03 09:40:57,567 : MainThread : INFO : scanning all indexed sentences and their word counts\n" + "2022-04-10 21:16:09,746 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:16:10,075 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:16:11,520 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 1703756 vocabulary: 1959 MB (1 GB)\n", + "2022-04-10 21:16:11,522 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:16:11,539 : MainThread : INFO : pre-computing uSIF weights for 1703756 words\n", + "2022-04-10 21:16:17,121 : MainThread : INFO : begin training\n", + "2022-04-10 21:16:17,566 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:16:17,606 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:16:17,608 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:16:17,609 : MainThread : INFO : training on 2758 effective sentences with 27448 effective words took 0s with 6182 sentences/s\n", + "2022-04-10 21:16:17,704 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 71.90\n" + "uSIF length 71.84\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:40:57,891 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 
09:40:57,929 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", - "2021-12-03 09:40:57,930 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:40:57,943 : MainThread : INFO : pre-computing uSIF weights for 77224 words\n", - "2021-12-03 09:40:58,158 : MainThread : INFO : begin training\n", - "2021-12-03 09:40:58,493 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:40:58,751 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:40:58,756 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:40:58,757 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 8200 sentences/s\n", - "2021-12-03 09:40:58,842 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:16:17,974 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:16:18,362 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:16:18,432 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 77224 vocabulary: 91 MB (0 GB)\n", + "2022-04-10 21:16:18,433 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:16:18,445 : MainThread : INFO : pre-computing uSIF weights for 77224 words\n", + "2022-04-10 21:16:18,706 : MainThread : INFO : begin training\n", + "2022-04-10 21:16:19,129 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:16:19,172 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:16:19,179 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:16:19,180 : MainThread : INFO : training on 2758 effective sentences with 27439 effective words took 0s with 6497 sentences/s\n", + "2022-04-10 21:16:19,265 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 79.02\n" + "uSIF length 79.00\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:41:01,070 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:41:01,448 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:41:02,515 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)\n", - "2021-12-03 09:41:02,516 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:41:02,529 : MainThread : INFO : pre-computing uSIF weights for 3000000 words\n", - "2021-12-03 09:41:09,927 : MainThread : INFO : begin training\n", - "2021-12-03 09:41:10,374 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:41:10,733 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:41:10,740 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:41:10,742 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 6148 sentences/s\n", - "2021-12-03 09:41:10,828 : MainThread : INFO : no frequency mode: using wordfreq 
for estimation of frequency for language: en\n" + "2022-04-10 21:16:28,027 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:16:28,352 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:16:31,060 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)\n", + "2022-04-10 21:16:31,061 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:16:31,074 : MainThread : INFO : pre-computing uSIF weights for 3000000 words\n", + "2022-04-10 21:16:41,139 : MainThread : INFO : begin training\n", + "2022-04-10 21:16:41,586 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:16:41,640 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:16:41,645 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:16:41,646 : MainThread : INFO : training on 2758 effective sentences with 23116 effective words took 0s with 6157 sentences/s\n", + "2022-04-10 21:16:41,744 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 67.15\n" + "uSIF length 66.99\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-03 09:41:12,490 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-03 09:41:12,816 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", - "2021-12-03 09:41:13,574 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", - "2021-12-03 09:41:13,575 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", - "2021-12-03 09:41:13,591 : MainThread : INFO : pre-computing uSIF weights for 2000000 words\n", - "2021-12-03 09:41:18,512 : MainThread : INFO : begin training\n", - "2021-12-03 09:41:18,980 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-03 09:41:19,312 : MainThread : INFO : computing 5 principal components took 0s\n", - "2021-12-03 09:41:19,317 : MainThread : INFO : removing 5 principal components took 0s\n", - "2021-12-03 09:41:19,318 : MainThread : INFO : training on 2758 effective sentences with 27528 effective words took 0s with 5884 sentences/s\n" + "2022-04-10 21:16:47,738 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:16:48,060 : MainThread : INFO : finished scanning 2758 sentences with an average length of 9 and 27528 total words\n", + "2022-04-10 21:16:49,907 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 2000000 vocabulary: 6877 MB (6 GB)\n", + "2022-04-10 21:16:49,909 : MainThread : INFO : initializing sentence vectors for 2758 sentences\n", + "2022-04-10 21:16:49,931 : MainThread : INFO : pre-computing uSIF weights for 2000000 words\n", + "2022-04-10 21:16:56,856 : MainThread : INFO : begin training\n", + "2022-04-10 21:16:57,322 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:16:57,374 : MainThread : INFO : computing 5 principal components took 0s\n", + "2022-04-10 21:16:57,384 : MainThread : INFO : removing 5 principal components took 0s\n", + "2022-04-10 21:16:57,386 : MainThread : INFO 
: training on 2758 effective sentences with 27528 effective words took 0s with 5915 sentences/s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "uSIF length 69.55\n" + "uSIF length 69.40\n" ] } ], @@ -1422,7 +3312,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -1465,28 +3355,28 @@ " uSIF\n", " paranmt-300\n", " length=11\n", - " 79.02\n", + " 79.00\n", " \n", " \n", " 27\n", " SIF-10\n", " paranmt-300\n", " components=10\n", - " 76.76\n", + " 76.72\n", " \n", " \n", " 25\n", " SIF-10\n", " paragram-300-sl999\n", " components=10\n", - " 74.27\n", + " 74.21\n", " \n", " \n", " 26\n", " SIF-10\n", " paragram-300-ws353\n", " components=10\n", - " 74.08\n", + " 74.03\n", " \n", " \n", "\n", @@ -1495,13 +3385,13 @@ "text/plain": [ " algo vecs params score\n", "12 CBOW paranmt-300 79.82\n", - "42 uSIF paranmt-300 length=11 79.02\n", - "27 SIF-10 paranmt-300 components=10 76.76\n", - "25 SIF-10 paragram-300-sl999 components=10 74.27\n", - "26 SIF-10 paragram-300-ws353 components=10 74.08" + "42 uSIF paranmt-300 length=11 79.00\n", + "27 SIF-10 paranmt-300 components=10 76.72\n", + "25 SIF-10 paragram-300-sl999 components=10 74.21\n", + "26 SIF-10 paragram-300-ws353 components=10 74.03" ] }, - "execution_count": 29, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1526,7 +3416,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -1534,38 +3424,38 @@ "output_type": "stream", "text": [ "`CBOW` | `paranmt-300` | | 79.82\n", - "`uSIF` | `paranmt-300` | length=11 | 79.02\n", - "`SIF-10` | `paranmt-300` | components=10 | 76.76\n", - "`SIF-10` | `paragram-300-sl999` | components=10 | 74.27\n", - "`SIF-10` | `paragram-300-ws353` | components=10 | 74.08\n", - "`SIF-10` | `fasttext-crawl-subwords-300` | components=10 | 73.54\n", - "`uSIF` | `paragram-300-sl999` | length=11 | 73.09\n", - "`SIF-10` | `fasttext-wiki-news-subwords-300` | components=10 | 72.24\n", - "`uSIF` | `paragram-300-ws353` | length=11 | 71.90\n", - "`SIF-10` | `glove-twitter-200` | components=10 | 71.67\n", - "`SIF-10` | `glove-wiki-gigaword-300` | components=10 | 71.43\n", - "`SIF-10` | `word2vec-google-news-300` | components=10 | 71.17\n", - "`SIF-10` | `glove-wiki-gigaword-200` | components=10 | 70.73\n", - "`SIF-10` | `glove-twitter-100` | components=10 | 69.70\n", - "`uSIF` | `fasttext-crawl-subwords-300` | length=11 | 69.55\n", - "`uSIF` | `fasttext-wiki-news-subwords-300` | length=11 | 69.05\n", - "`SIF-10` | `glove-wiki-gigaword-100` | components=10 | 68.43\n", - "`uSIF` | `glove-wiki-gigaword-300` | length=11 | 67.73\n", - "`uSIF` | `glove-wiki-gigaword-200` | length=11 | 67.26\n", - "`uSIF` | `word2vec-google-news-300` | length=11 | 67.15\n", - "`uSIF` | `glove-twitter-200` | length=11 | 66.73\n", - "`SIF-10` | `glove-twitter-50` | components=10 | 65.57\n", - "`uSIF` | `glove-wiki-gigaword-100` | length=11 | 65.48\n", - "`uSIF` | `paragram-25` | length=11 | 64.31\n", - "`uSIF` | `glove-twitter-100` | length=11 | 64.22\n", - "`SIF-10` | `glove-wiki-gigaword-50` | components=10 | 64.20\n", - "`uSIF` | `glove-wiki-gigaword-50` | length=11 | 62.22\n", + "`uSIF` | `paranmt-300` | length=11 | 79.00\n", + "`SIF-10` | `paranmt-300` | components=10 | 76.72\n", + "`SIF-10` | `paragram-300-sl999` | components=10 | 74.21\n", + "`SIF-10` | `paragram-300-ws353` | components=10 | 74.03\n", + "`SIF-10` | `fasttext-crawl-subwords-300` | components=10 | 73.38\n", + "`uSIF` 
| `paragram-300-sl999` | length=11 | 73.04\n", + "`SIF-10` | `fasttext-wiki-news-subwords-300` | components=10 | 72.29\n", + "`uSIF` | `paragram-300-ws353` | length=11 | 71.84\n", + "`SIF-10` | `glove-twitter-200` | components=10 | 71.62\n", + "`SIF-10` | `glove-wiki-gigaword-300` | components=10 | 71.35\n", + "`SIF-10` | `word2vec-google-news-300` | components=10 | 71.12\n", + "`SIF-10` | `glove-wiki-gigaword-200` | components=10 | 70.62\n", + "`SIF-10` | `glove-twitter-100` | components=10 | 69.65\n", + "`uSIF` | `fasttext-crawl-subwords-300` | length=11 | 69.40\n", + "`uSIF` | `fasttext-wiki-news-subwords-300` | length=11 | 68.63\n", + "`SIF-10` | `glove-wiki-gigaword-100` | components=10 | 68.34\n", + "`uSIF` | `glove-wiki-gigaword-300` | length=11 | 67.60\n", + "`uSIF` | `glove-wiki-gigaword-200` | length=11 | 67.11\n", + "`uSIF` | `word2vec-google-news-300` | length=11 | 66.99\n", + "`uSIF` | `glove-twitter-200` | length=11 | 66.67\n", + "`SIF-10` | `glove-twitter-50` | components=10 | 65.52\n", + "`uSIF` | `glove-wiki-gigaword-100` | length=11 | 65.33\n", + "`uSIF` | `paragram-25` | length=11 | 64.22\n", + "`uSIF` | `glove-twitter-100` | length=11 | 64.13\n", + "`SIF-10` | `glove-wiki-gigaword-50` | components=10 | 64.11\n", + "`uSIF` | `glove-wiki-gigaword-50` | length=11 | 62.06\n", "`CBOW` | `word2vec-google-news-300` | | 61.54\n", - "`uSIF` | `glove-twitter-50` | length=11 | 60.50\n", - "`SIF-10` | `paragram-25` | components=10 | 59.22\n", - "`uSIF` | `glove-twitter-25` | length=11 | 55.17\n", + "`uSIF` | `glove-twitter-50` | length=11 | 60.41\n", + "`SIF-10` | `paragram-25` | components=10 | 59.07\n", + "`uSIF` | `glove-twitter-25` | length=11 | 55.06\n", "`CBOW` | `paragram-300-ws353` | | 54.72\n", - "`SIF-10` | `glove-twitter-25` | components=10 | 54.42\n", + "`SIF-10` | `glove-twitter-25` | components=10 | 54.16\n", "`CBOW` | `paragram-300-sl999` | | 51.46\n", "`CBOW` | `fasttext-crawl-subwords-300` | | 48.49\n", "`CBOW` | `glove-wiki-gigaword-300` | | 44.46\n", diff --git a/notebooks/Speed Comparision.ipynb b/notebooks/Speed Comparision.ipynb index df10254..a41e7b8 100644 --- a/notebooks/Speed Comparision.ipynb +++ b/notebooks/Speed Comparision.ipynb @@ -96,7 +96,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "128 µs ± 8.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" + "119 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], @@ -114,7 +114,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "316 ms ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "1.18 s ± 37.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -151,7 +151,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "32.4 ms ± 350 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "35.7 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -169,7 +169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.61 ms ± 55.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" + "1.93 ms ± 26.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], @@ -238,7 +238,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "38 ms ± 982 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "36.8 ms ± 554 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -256,7 +256,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2.25 ms ± 31.4 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" + "2.54 ms ± 44.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], diff --git a/notebooks/Tutorial.ipynb b/notebooks/Tutorial.ipynb index 6443f7d..6460fc6 100644 --- a/notebooks/Tutorial.ipynb +++ b/notebooks/Tutorial.ipynb @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -35,9 +35,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Hello', 'world']\n", + "0\n" + ] + } + ], "source": [ "s = ([\"Hello\", \"world\"], 0)\n", "print(s[0])\n", @@ -301,10 +310,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:28:02,197 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n", - "2021-12-02 20:28:03,181 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy with mmap=None\n", - "2021-12-02 20:28:03,249 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", - "2021-12-02 20:28:03,250 : MainThread : INFO : loaded /home/oborchers/.cache/huggingface/hub/fse__glove-wiki-gigaword-100.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n" + "2022-04-10 21:00:42,265 : MainThread : INFO : Lock 23311648346896 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.lock\n", + "2022-04-10 21:00:42,267 : MainThread : INFO : Lock 23311648346896 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.lock\n", + "2022-04-10 21:00:47,947 : MainThread : INFO : Lock 23311648386016 acquired on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy.lock\n", + "2022-04-10 21:00:47,949 : MainThread : INFO : Lock 23311648386016 released on /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy.lock\n", + "2022-04-10 21:00:47,951 : MainThread : INFO : loading Vectors object from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model\n", + "2022-04-10 21:00:48,915 : MainThread : INFO : loading vectors from /home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model.vectors.npy with mmap=None\n", + "2022-04-10 21:00:48,970 : MainThread : INFO : setting ignored attribute vectors_norm to None\n", + "2022-04-10 21:00:51,758 : MainThread : INFO : KeyedVectors lifecycle event {'fname': '/home/oborchers/.cache/huggingface/hub/fse--glove-wiki-gigaword-100.main.3282d5e7c5e979c2411ba9703d63a46243a2047e/glove-wiki-gigaword-100.model', 'datetime': '2022-04-10T21:00:51.731098', 'gensim': '4.0.0', 'python': '3.8.5 (default, Sep 4 2020, 07:30:14) \\n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-173-generic-x86_64-with-glibc2.10', 'event': 'loaded'}\n" ] } ], @@ 
-369,14 +382,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:29:33,501 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" + "2022-04-10 21:01:20,427 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en\n" ] } ], @@ -387,30 +400,31 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:29:33,926 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-02 20:29:38,928 : MainThread : INFO : SCANNING : finished 3898235 sentences with 43084793 words\n", - "2021-12-02 20:29:42,336 : MainThread : WARNING : found 16 empty sentences\n", - "2021-12-02 20:29:42,337 : MainThread : INFO : finished scanning 6468640 sentences with an average length of 11 and 71556728 total words\n", - "2021-12-02 20:29:42,467 : MainThread : INFO : estimated memory for 6468640 sentences with 100 dimensions and 400000 vocabulary: 2621 MB (2 GB)\n", - "2021-12-02 20:29:42,468 : MainThread : INFO : initializing sentence vectors for 6468640 sentences\n", - "2021-12-02 20:30:01,833 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", - "2021-12-02 20:30:02,752 : MainThread : INFO : begin training\n", - "2021-12-02 20:30:07,761 : MainThread : INFO : PROGRESS : finished 25.71% with 1663049 sentences and 12641690 words, 332609 sentences/s\n", - "2021-12-02 20:30:12,763 : MainThread : INFO : PROGRESS : finished 49.99% with 3233385 sentences and 24604424 words, 314067 sentences/s\n", - "2021-12-02 20:30:17,765 : MainThread : INFO : PROGRESS : finished 74.09% with 4792913 sentences and 36482181 words, 311905 sentences/s\n", - "2021-12-02 20:30:22,767 : MainThread : INFO : PROGRESS : finished 98.63% with 6380016 sentences and 48580744 words, 317420 sentences/s\n", - "2021-12-02 20:30:23,043 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2021-12-02 20:30:23,044 : MainThread : INFO : sampling 2684354 vectors to compute principal components\n", - "2021-12-02 20:30:28,301 : MainThread : INFO : computing 5 principal components took 5s\n", - "2021-12-02 20:30:30,143 : MainThread : INFO : removing 5 principal components took 1s\n", - "2021-12-02 20:30:30,144 : MainThread : INFO : training on 6468624 effective sentences with 49255184 effective words took 20s with 318779 sentences/s\n" + "2022-04-10 21:01:21,811 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:01:26,812 : MainThread : INFO : SCANNING : finished 4268552 sentences with 47204452 words\n", + "2022-04-10 21:01:29,815 : MainThread : WARNING : found 16 empty sentences\n", + "2022-04-10 21:01:29,816 : MainThread : INFO : finished scanning 6468640 sentences with an average length of 11 and 71556728 total words\n", + "2022-04-10 21:01:30,127 : MainThread : INFO : estimated memory for 6468640 sentences with 100 dimensions and 400000 vocabulary: 2621 MB (2 GB)\n", + "2022-04-10 21:01:30,129 : MainThread : INFO : initializing sentence vectors for 6468640 sentences\n", + "2022-04-10 21:01:58,270 : MainThread : INFO : pre-computing uSIF weights for 400000 words\n", + "2022-04-10 21:01:59,606 : MainThread : INFO : begin training\n", + "2022-04-10 21:02:04,625 : MainThread : INFO : PROGRESS : finished 
20.57% with 1330528 sentences and 10112720 words, 266105 sentences/s\n", + "2022-04-10 21:02:09,626 : MainThread : INFO : PROGRESS : finished 40.11% with 2594574 sentences and 19736065 words, 252809 sentences/s\n", + "2022-04-10 21:02:14,629 : MainThread : INFO : PROGRESS : finished 59.45% with 3845556 sentences and 29254320 words, 250196 sentences/s\n", + "2022-04-10 21:02:19,631 : MainThread : INFO : PROGRESS : finished 78.39% with 5070852 sentences and 38592790 words, 245059 sentences/s\n", + "2022-04-10 21:02:24,632 : MainThread : INFO : PROGRESS : finished 96.63% with 6250702 sentences and 47579860 words, 235970 sentences/s\n", + "2022-04-10 21:02:25,623 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-04-10 21:02:25,624 : MainThread : INFO : sampling 2684354 vectors to compute principal components\n", + "2022-04-10 21:02:37,478 : MainThread : INFO : computing 5 principal components took 11s\n", + "2022-04-10 21:02:44,538 : MainThread : INFO : removing 5 principal components took 7s\n", + "2022-04-10 21:02:44,539 : MainThread : INFO : training on 6468624 effective sentences with 49255184 effective words took 26s with 248624 sentences/s\n" ] }, { @@ -419,7 +433,7 @@ "(6468624, 49255184)" ] }, - "execution_count": 24, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -451,48 +465,48 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:29:27,731 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-02 20:29:27,732 : MainThread : INFO : finished scanning 1 sentences with an average length of 3 and 3 total words\n", - "2021-12-02 20:29:27,733 : MainThread : INFO : removing 5 principal components took 0s\n" + "2022-04-10 21:02:44,544 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:02:44,545 : MainThread : INFO : finished scanning 1 sentences with an average length of 3 and 3 total words\n", + "2022-04-10 21:02:44,546 : MainThread : INFO : removing 5 principal components took 0s\n" ] }, { "data": { "text/plain": [ - "array([[ 2.52946198e-01, -2.80404240e-02, 2.69833803e-02,\n", - " -2.78671950e-01, -7.44080096e-02, 4.57280308e-01,\n", - " -1.05054319e-01, 2.72667259e-02, -6.48381487e-02,\n", - " -3.40230405e-01, -2.04274803e-03, -7.25736842e-02,\n", - " 1.93554670e-01, 1.53935701e-01, -1.17377929e-01,\n", - " -2.86470026e-01, 9.35275406e-02, -1.55883789e-01,\n", - " -3.67838562e-01, 3.55114430e-01, -1.01716474e-01,\n", - " 2.67178684e-01, -3.58482040e-02, -1.73439160e-01,\n", - " 1.11153685e-01, 9.17388499e-02, -2.18827292e-01,\n", - " -5.82419336e-02, 4.64093864e-01, 1.16017178e-01,\n", - " 2.43311703e-01, 2.93871671e-01, 3.83903325e-01,\n", - " 1.23666152e-01, 1.68591365e-03, 2.47326195e-01,\n", - " 1.76458687e-01, 6.19876608e-02, 2.72473156e-01,\n", - " -1.29384965e-01, -1.28560305e-01, 1.32312194e-01,\n", - " 2.21162975e-01, -1.13845311e-01, -1.39296561e-01,\n", - " -1.14041977e-01, -4.00316596e-01, 3.18139911e-01,\n", - " 3.94160390e-01, -1.03439599e-01, -1.09797075e-01,\n", - " -2.17508301e-01, 8.62751678e-02, 1.37668043e-01,\n", - " 9.09228399e-02, -4.50117111e-01, 2.61651352e-04,\n", - " 2.41029952e-02, -9.62151289e-02, 1.86967283e-01,\n", - " 5.71158007e-02, 6.35597467e-01, 3.55606563e-02,\n", - " -1.72205180e-01, 5.56394160e-02, -8.17311108e-02,\n", - " 2.58098125e-01, 2.58260906e-01, 2.35461563e-01,\n", - " -1.40228420e-02, -1.79286793e-01, 1.29779249e-01,\n", - " 4.51383963e-02, -4.99045491e-01, 
-2.24621549e-01,\n", - " 4.01714832e-01, 3.15032780e-01, -2.19353005e-01,\n", - " -4.74357456e-02, 8.13784525e-02, -4.94770855e-02,\n", - " 4.28712875e-01, 3.55663359e-01, -2.26212099e-01,\n", - " -5.20673275e-01, -1.03911251e-01, -3.50226879e-01,\n", - " 1.44183502e-01, -2.71161199e-01, -2.69273341e-01,\n", - " -7.84424692e-02, 6.86244369e-02, 5.69853187e-01,\n", - " -2.65402459e-02, -4.07446802e-01, -1.45264074e-01,\n", - " -2.33762369e-01, 1.61456198e-01, -7.20783919e-02,\n", - " 2.83768862e-01]], dtype=float32)" + "array([[ 2.58816600e-01, -2.95302048e-02, 2.75351852e-02,\n", + " -2.84114152e-01, -7.67392814e-02, 4.69468027e-01,\n", + " -1.08480439e-01, 2.74309274e-02, -6.55729175e-02,\n", + " -3.49144369e-01, -3.67706642e-03, -7.37217665e-02,\n", + " 1.99556813e-01, 1.58444345e-01, -1.19665325e-01,\n", + " -2.94139415e-01, 9.47405249e-02, -1.60569757e-01,\n", + " -3.78268361e-01, 3.62190604e-01, -1.04472280e-01,\n", + " 2.73262203e-01, -3.63826789e-02, -1.77271560e-01,\n", + " 1.13316491e-01, 9.37681645e-02, -2.24020645e-01,\n", + " -5.91988862e-02, 4.76207793e-01, 1.19424753e-01,\n", + " 2.50848651e-01, 3.00246894e-01, 3.93525660e-01,\n", + " 1.26845583e-01, 9.48224217e-04, 2.53242493e-01,\n", + " 1.82835817e-01, 6.30666614e-02, 2.79401392e-01,\n", + " -1.32174164e-01, -1.33375764e-01, 1.35822147e-01,\n", + " 2.27280006e-01, -1.15767486e-01, -1.42402038e-01,\n", + " -1.17349595e-01, -4.09546375e-01, 3.27205539e-01,\n", + " 4.02800381e-01, -1.04363769e-01, -1.11321002e-01,\n", + " -2.22490475e-01, 8.87305886e-02, 1.40741229e-01,\n", + " 9.31955799e-02, -4.58287299e-01, 5.18307090e-04,\n", + " 2.35566050e-02, -1.00434214e-01, 1.92349762e-01,\n", + " 5.87247610e-02, 6.50634110e-01, 3.70587595e-02,\n", + " -1.77234858e-01, 5.73643446e-02, -8.31981450e-02,\n", + " 2.63155580e-01, 2.63526857e-01, 2.41669610e-01,\n", + " -1.44437030e-02, -1.83978081e-01, 1.32746503e-01,\n", + " 4.74075712e-02, -5.11893451e-01, -2.29704395e-01,\n", + " 4.10661429e-01, 3.22975338e-01, -2.24843532e-01,\n", + " -4.89681363e-02, 8.36608559e-02, -5.19971400e-02,\n", + " 4.39288676e-01, 3.65279317e-01, -2.31688872e-01,\n", + " -5.32944202e-01, -1.05634883e-01, -3.59344721e-01,\n", + " 1.47738606e-01, -2.77038217e-01, -2.75986940e-01,\n", + " -8.08256269e-02, 7.00389221e-02, 5.83384752e-01,\n", + " -2.55951770e-02, -4.16631788e-01, -1.48772165e-01,\n", + " -2.40149587e-01, 1.64508954e-01, -7.53749311e-02,\n", + " 2.90106297e-01]], dtype=float32)" ] }, "execution_count": 15, @@ -527,26 +541,26 @@ { "data": { "text/plain": [ - "array([ 0.06334692, -0.00278309, 0.02876258, 0.2938737 , 0.16536492,\n", - " -0.32892653, -0.24968779, -0.11547095, -0.00762739, -0.09775834,\n", - " -0.02934675, 0.11205705, -0.06664 , -0.26486415, -0.1903032 ,\n", - " -0.05020472, -0.00186126, 0.06867541, 0.02295774, 0.15203542,\n", - " 0.09067672, 0.04975739, -0.23175132, 0.14476334, -0.14295411,\n", - " 0.02923434, 0.04803507, 0.06715866, -0.07600797, 0.01031642,\n", - " -0.2484782 , 0.22390996, -0.09542373, -0.09283138, 0.13540202,\n", - " 0.15456603, 0.19957334, -0.10639023, -0.09370194, -0.21725996,\n", - " -0.0491615 , -0.07300739, 0.03414775, -0.09599279, -0.24818763,\n", - " 0.1342045 , -0.23917073, 0.05558453, -0.06525436, -0.48910773,\n", - " -0.22362332, -0.00779874, -0.03814342, 0.2980885 , -0.17636092,\n", - " -0.5499361 , -0.14905512, -0.03137571, 0.67050046, -0.07416987,\n", - " 0.0496444 , -0.18189807, -0.14830717, -0.00139662, 0.05445424,\n", - " 0.14017463, -0.19543567, 0.214339 , 0.12590402, -0.07116839,\n", - " 
0.08139852, -0.06057443, -0.2506972 , -0.30141208, -0.13040717,\n", - " -0.16307008, -0.17811869, 0.08592579, -0.10485128, -0.19760096,\n", - " 0.21615295, 0.23217016, -0.07084101, -0.08753166, 0.00301737,\n", - " -0.03807069, 0.05616122, -0.11818701, -0.03473554, -0.1218198 ,\n", - " -0.01102988, 0.03009659, 0.02223159, -0.31992042, -0.26482922,\n", - " 0.10421025, 0.23144925, -0.21129477, 0.1534869 , 0.2635986 ],\n", + "array([ 0.06577122, 0.00125362, 0.02861076, 0.29774222, 0.16603038,\n", + " -0.3326527 , -0.25274056, -0.11763255, -0.00691745, -0.09893274,\n", + " -0.03162983, 0.11619435, -0.06521351, -0.2655796 , -0.19060446,\n", + " -0.0539908 , -0.00769202, 0.06885006, 0.02343568, 0.1532896 ,\n", + " 0.09342796, 0.04696935, -0.23081023, 0.1498743 , -0.14505012,\n", + " 0.02421208, 0.05296347, 0.06859872, -0.07599732, 0.00725658,\n", + " -0.24535023, 0.22700995, -0.09825065, -0.09442319, 0.13708887,\n", + " 0.15521362, 0.20619243, -0.10562573, -0.09422812, -0.21555066,\n", + " -0.04667541, -0.0792224 , 0.03256182, -0.09426205, -0.24983093,\n", + " 0.13597786, -0.2398801 , 0.05768704, -0.06787113, -0.4941577 ,\n", + " -0.22496416, -0.00669573, -0.03549183, 0.29734784, -0.17858498,\n", + " -0.5486123 , -0.14864792, -0.03553995, 0.67317814, -0.076203 ,\n", + " 0.05146242, -0.18583792, -0.15082437, -0.00156193, 0.05055934,\n", + " 0.14105804, -0.19593558, 0.21223646, 0.12954028, -0.07022676,\n", + " 0.07883358, -0.06120036, -0.2503627 , -0.3017727 , -0.13266395,\n", + " -0.16572367, -0.18174419, 0.08304436, -0.11019304, -0.19390059,\n", + " 0.21516724, 0.2340886 , -0.06900662, -0.0862717 , 0.00340593,\n", + " -0.03866918, 0.05421902, -0.11847197, -0.03135929, -0.11774423,\n", + " -0.01175743, 0.03097493, 0.02155077, -0.32340154, -0.26865822,\n", + " 0.10741439, 0.23262182, -0.21423306, 0.15430337, 0.26367036],\n", " dtype=float32)" ] }, @@ -618,22 +632,22 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:29:27,773 : MainThread : INFO : precomputing L2-norms of sentence vectors\n" + "2022-04-10 21:02:46,966 : MainThread : INFO : precomputing L2-norms of sentence vectors\n" ] }, { "data": { "text/plain": [ - "[(2688924, 1.0),\n", - " (3116047, 1.0),\n", - " (2688918, 1.0),\n", - " (2688920, 1.0),\n", - " (2688922, 1.0),\n", - " (2688914, 1.0),\n", - " (2688926, 1.0),\n", - " (3116041, 1.0),\n", - " (3116043, 1.0),\n", - " (1384926, 1.0)]" + "[(5742433, 1.0),\n", + " (5172303, 1.0),\n", + " (1657838, 1.0),\n", + " (5742447, 1.0),\n", + " (5742445, 1.0),\n", + " (5742443, 1.0),\n", + " (5742441, 1.0),\n", + " (5742439, 1.0),\n", + " (5742437, 1.0),\n", + " (5742435, 1.0)]" ] }, "execution_count": 19, @@ -661,16 +675,16 @@ { "data": { "text/plain": [ - "[(['Will', 'Facebook', 'buy', 'Quora?'], 2688924, 1.0),\n", - " (['Why', \"doesn't\", 'Apple', 'buy', 'Samsung?'], 3116047, 1.0),\n", - " (['Will', 'Facebook', 'buy', 'Quora?'], 2688918, 1.0),\n", - " (['Will', 'Facebook', 'buy', 'Quora?'], 2688920, 1.0),\n", - " (['Will', 'Facebook', 'buy', 'Quora?'], 2688922, 1.0),\n", - " (['Will', 'Facebook', 'buy', 'Quora?'], 2688914, 1.0),\n", - " (['Will', 'Facebook', 'buy', 'Quora?'], 2688926, 1.0),\n", - " (['Why', \"doesn't\", 'Apple', 'buy', 'Samsung?'], 3116041, 1.0),\n", - " (['Why', \"doesn't\", 'Apple', 'buy', 'Samsung?'], 3116043, 1.0),\n", - " (['Should', 'I', 'buy', 'CS:GO?'], 1384926, 1.0)]" + "[(['Should', 'I', 'buy', 'Google', 'Glass?'], 5742433, 1.0),\n", + " (['Why', \"doesn't\", 'Apple', 'buy', 'Samsung?'], 5172303, 1.0),\n", + " (['Should', 'I', 'buy', 'Moto', 
'G4', 'Plus?'], 1657838, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742447, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742445, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742443, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742441, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742439, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742437, 1.0),\n", + " (['Should', 'I', 'buy', 'Google', 'Glass?'], 5742435, 1.0)]" ] }, "execution_count": 20, @@ -715,8 +729,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717071,\n", - " 0.9476152658462524),\n", + " 4717063,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -728,8 +742,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717059,\n", - " 0.9476152658462524),\n", + " 4717065,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -742,7 +756,7 @@ " 'or',\n", " 'Python/Django?'],\n", " 4717061,\n", - " 0.9476152658462524),\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -754,8 +768,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717063,\n", - " 0.9476152658462524),\n", + " 4717057,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -767,8 +781,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717065,\n", - " 0.9476152658462524),\n", + " 4717067,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -780,8 +794,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717067,\n", - " 0.9476152658462524),\n", + " 4717069,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -793,8 +807,8 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717069,\n", - " 0.9476152658462524),\n", + " 4717059,\n", + " 0.9466769099235535),\n", " (['Which',\n", " 'is',\n", " 'more',\n", @@ -806,14 +820,14 @@ " 'Rails',\n", " 'or',\n", " 'Python/Django?'],\n", - " 4717057,\n", - " 0.9476152658462524),\n", + " 4717071,\n", + " 0.9466769099235535),\n", " (['How', 'can', 'I', 'make', 'some', 'easy', 'money?'],\n", - " 6461300,\n", - " 0.9442180395126343),\n", + " 5443129,\n", + " 0.9436649680137634),\n", " (['How', 'can', 'I', 'make', 'some', 'easy', 'money?'],\n", - " 6461302,\n", - " 0.9442180395126343)]" + " 5443135,\n", + " 0.9436649680137634)]" ] }, "execution_count": 21, @@ -841,44 +855,44 @@ "name": "stderr", "output_type": "stream", "text": [ - "2021-12-02 20:29:31,107 : MainThread : INFO : scanning all indexed sentences and their word counts\n", - "2021-12-02 20:29:31,109 : MainThread : INFO : finished scanning 1 sentences with an average length of 6 and 6 total words\n", - "2021-12-02 20:29:31,110 : MainThread : INFO : removing 5 principal components took 0s\n" + "2022-04-10 21:02:52,537 : MainThread : INFO : scanning all indexed sentences and their word counts\n", + "2022-04-10 21:02:52,537 : MainThread : INFO : finished scanning 1 sentences with an average length of 6 and 6 total words\n", + "2022-04-10 21:02:52,538 : MainThread : INFO : removing 5 principal components took 0s\n" ] }, { "data": { "text/plain": [ "[(['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 6255670,\n", - " 0.9872919321060181),\n", + " 418230,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 6255672,\n", - " 0.9872919321060181),\n", + " 418232,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 
'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 418226,\n", - " 0.9872919321060181),\n", + " 418236,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 418224,\n", - " 0.9872919321060181),\n", + " 6255670,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 418232,\n", - " 0.9872919321060181),\n", + " 418226,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 6255678,\n", - " 0.9872919321060181),\n", + " 418228,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 6255676,\n", - " 0.9872919321060181),\n", + " 418238,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 6255674,\n", - " 0.9872919321060181),\n", + " 6255666,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 418236,\n", - " 0.9872919321060181),\n", + " 418234,\n", + " 0.9860147833824158),\n", " (['How', 'do', 'I', 'learn', 'Python', 'in', 'easy', 'way?'],\n", - " 418228,\n", - " 0.9872919321060181)]" + " 6255674,\n", + " 0.9860147833824158)]" ] }, "execution_count": 22, diff --git a/release.sh b/release.sh index c545bd6..6c4144e 100644 --- a/release.sh +++ b/release.sh @@ -3,4 +3,6 @@ docformatter --in-place **/*.py --wrap-summaries 88 --wrap-descriptions 88 isort --atomic **/*.py black . -coverage run --source fse setup.py test \ No newline at end of file +pytest -v --cov=fse --cov-report=term-missing + +pdoc --html --output-dir docs --force fse \ No newline at end of file diff --git a/setup.py b/setup.py index d1dc663..52ce3d5 100755 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ from setuptools.command.build_ext import build_ext NAME = "fse" -VERSION = "0.2.0" +VERSION = "1.0.0" DESCRIPTION = "Fast Sentence Embeddings for Gensim" AUTHOR = "Oliver Borchers" AUTHOR_EMAIL = "o.borchers@oxolo.com" @@ -25,14 +25,14 @@ LICENSE = "GPL-3.0" REQUIRES_PYTHON = ">=3.6" NUMPY_STR = "numpy >= 1.11.3" -CYTHON_STR = "Cython==0.29.14" +CYTHON_STR = "Cython==0.29.23" INSTALL_REQUIRES = [ NUMPY_STR, "scipy >= 0.18.1", "smart_open >= 1.5.0", "scikit-learn >= 0.19.1", - "gensim<4", + "gensim>=4", "wordfreq >= 2.2.1", "huggingface-hub", "psutil", diff --git a/test/test_average.py b/test/test_average.py index 4462415..7c6d249 100644 --- a/test/test_average.py +++ b/test/test_average.py @@ -21,8 +21,10 @@ TEST_DATA = Path(__file__).parent / "test_data" CORPUS = TEST_DATA / "test_sentences.txt" + + DIM = 5 -W2V = Word2Vec(min_count=1, size=DIM) +W2V = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: SENTENCES = [l.split() for _, l in enumerate(file)] W2V.build_vocab(SENTENCES) @@ -64,9 +66,9 @@ def test_average_train_np_w2v(self): self.model, self.sentences, self.model.sv.vectors, mem ) self.assertEqual((4, 7), output) - self.assertTrue((183 == self.model.sv[0]).all()) - self.assertTrue((164.5 == self.model.sv[1]).all()) - self.assertTrue((self.model.wv.vocab["go"].index == self.model.sv[2]).all()) + self.assertTrue((179 == self.model.sv[0]).all()) + self.assertTrue((140.75 == self.model.sv[1]).all()) + self.assertTrue((self.model.wv.key_to_index["go"] == self.model.sv[2]).all()) def test_average_train_cy_w2v(self): self.model.sv.vectors = np.zeros_like(self.model.sv.vectors, dtype=np.float32) @@ -78,12 +80,12 @@ def test_average_train_cy_w2v(self): self.model, self.sentences, self.model.sv.vectors, mem ) 
self.assertEqual((4, 7), output) - self.assertTrue((183 == self.model.sv[0]).all()) - self.assertTrue((164.5 == self.model.sv[1]).all()) - self.assertTrue((self.model.wv.vocab["go"].index == self.model.sv[2]).all()) + self.assertTrue((179 == self.model.sv[0]).all()) + self.assertTrue((140.75 == self.model.sv[1]).all()) + self.assertTrue((self.model.wv.key_to_index["go"] == self.model.sv[2]).all()) def test_average_train_np_ft(self): - ft = FastText(min_count=1, size=DIM) + ft = FastText(min_count=1, vector_size=DIM) ft.build_vocab(SENTENCES) m = Average(ft) m.prep.prepare_vectors( @@ -103,7 +105,7 @@ def test_average_train_np_ft(self): # (2 + 1) / 2 = 1.5 def test_average_train_cy_ft(self): - ft = FastText(min_count=1, size=DIM) + ft = FastText(min_count=1, vector_size=DIM) ft.build_vocab(SENTENCES) m = Average(ft) m.prep.prepare_vectors( @@ -146,7 +148,7 @@ def test_cy_equal_np_w2v(self): self.assertTrue((m1.sv.vectors == m2.sv.vectors).all()) def test_cy_equal_np_w2v_random(self): - w2v = Word2Vec(min_count=1, size=DIM) + w2v = Word2Vec(min_count=1, vector_size=DIM) # Random initialization w2v.build_vocab(SENTENCES) @@ -172,7 +174,7 @@ def test_cy_equal_np_w2v_random(self): self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6)) def test_cy_equal_np_ft_random(self): - ft = FastText(size=20, min_count=1) + ft = FastText(vector_size=20, min_count=1) ft.build_vocab(SENTENCES) m1 = Average(ft) diff --git a/test/test_base_s2v.py b/test/test_base_s2v.py index 8e7d3a4..48f0986 100644 --- a/test/test_base_s2v.py +++ b/test/test_base_s2v.py @@ -12,7 +12,7 @@ import numpy as np from gensim.models import FastText, Word2Vec -from gensim.models.keyedvectors import BaseKeyedVectors +from gensim.models.keyedvectors import KeyedVectors from wordfreq import get_frequency_dict from fse.models.base_s2v import EPS, BaseSentence2VecModel, BaseSentence2VecPreparer @@ -22,7 +22,7 @@ TEST_DATA = Path(__file__).parent / "test_data" CORPUS = TEST_DATA / "test_sentences.txt" DIM = 5 -W2V = Word2Vec(min_count=1, size=DIM) +W2V = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: SENTENCES = [l.split() for _, l in enumerate(file)] W2V.build_vocab(SENTENCES) @@ -35,18 +35,18 @@ def test_init_w_wrong_model(self): def test_init_w_empty_w2v_model(self): with self.assertRaises(RuntimeError): - w2v = Word2Vec() + w2v = Word2Vec(min_count=1, vector_size=DIM) del w2v.wv.vectors BaseSentence2VecModel(w2v) def test_init_w_empty_vocab_model(self): with self.assertRaises(RuntimeError): - w2v = Word2Vec() - del w2v.wv.vocab + w2v = Word2Vec(min_count=1, vector_size=DIM) + del w2v.wv BaseSentence2VecModel(w2v) def test_init_w_ft_model_wo_vecs(self): - ft = FastText(SENTENCES, size=5) + ft = FastText(SENTENCES, vector_size=5) with self.assertRaises(RuntimeError): ft.wv.vectors_vocab = None BaseSentence2VecModel(ft) @@ -55,26 +55,26 @@ def test_init_w_ft_model_wo_vecs(self): BaseSentence2VecModel(ft) def test_init_w_empty_ft_model(self): - ft = FastText(min_count=1, size=DIM) + ft = FastText(min_count=1, vector_size=DIM) ft.wv.vectors = np.zeros(10) ft.wv.vectors_ngrams = None with self.assertRaises(RuntimeError): BaseSentence2VecModel(ft) def test_init_w_incompatible_ft_model(self): - ft = FastText(min_count=1, size=DIM, compatible_hash=False) + ft = FastText(min_count=1, vector_size=DIM) with self.assertRaises(RuntimeError): BaseSentence2VecModel(ft) def test_include_model(self): se = BaseSentence2VecModel(W2V) - self.assertTrue(isinstance(se.wv, BaseKeyedVectors)) + 
self.assertTrue(isinstance(se.wv, KeyedVectors)) def test_model_w_language(self): se = BaseSentence2VecModel(W2V, lang_freq="en") freq = int((2 ** 31 - 1) * get_frequency_dict("en", wordlist="best")["help"]) - self.assertEqual(freq, se.wv.vocab["help"].count) - self.assertEqual(21, se.wv.vocab["79"].count) + self.assertEqual(freq, se.wv.get_vecattr("help", "count")) + self.assertEqual(21, se.wv.get_vecattr("79", "count")) def test_model_w_wrong_language(self): with self.assertRaises(ValueError): @@ -87,12 +87,12 @@ def test_save_load(self): self.assertTrue(p.exists()) se2 = BaseSentence2VecModel.load(str(p.absolute())) self.assertTrue((se.wv.vectors == se2.wv.vectors).all()) - self.assertEqual(se.wv.index2word, se2.wv.index2word) + self.assertEqual(se.wv.index_to_key, se2.wv.index_to_key) self.assertEqual(se.workers, se2.workers) p.unlink() def test_save_load_with_memmap(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) shape = (1000, 1000) ft.wv.vectors = np.zeros(shape, np.float32) @@ -122,7 +122,7 @@ def test_save_load_with_memmap(self): p.unlink() def test_map_all_vectors_to_disk(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) p = TEST_DATA / "test_emb" @@ -161,7 +161,7 @@ def test_scan_w_list(self): def test_str_rep(self): output = str(BaseSentence2VecModel(W2V)) self.assertEqual( - "BaseSentence2VecModel based on Word2VecKeyedVectors, size=0", output + "BaseSentence2VecModel based on KeyedVectors, vector_size=0", output ) def test_scan_w_ituple(self): @@ -216,7 +216,7 @@ def test_scan_w_many_to_one_input(self): self.assertEqual(1, output) def test_estimate_memory(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) self.assertEqual(2040025124, se.estimate_memory(int(1e8))["Total"]) @@ -247,7 +247,7 @@ def test_child_requirements(self): se._post_inference_calls() def test_check_pre_train_san_no_wv(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.wv = None @@ -255,7 +255,7 @@ def test_check_pre_train_san_no_wv(self): se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_san_no_wv_len(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.wv.vectors = [] @@ -263,7 +263,7 @@ def test_check_pre_train_san_no_wv_len(self): se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_san_no_ngrams_vectors(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.wv.vectors_ngrams = [] @@ -275,7 +275,7 @@ def test_check_pre_train_san_no_ngrams_vectors(self): se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_san_no_sv_vecs(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.sv.vectors = None @@ -283,7 +283,7 @@ def test_check_pre_train_san_no_sv_vecs(self): se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_san_no_word_weights(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.word_weights = None @@ -291,7 +291,7 @@ def test_check_pre_train_san_no_word_weights(self): 
se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_san_incos_len(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) se.word_weights = np.ones(20) @@ -299,42 +299,42 @@ def test_check_pre_train_san_incos_len(self): se._check_pre_training_sanity(1, 1, 1) def test_check_pre_train_dtypes(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) - se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float64) + se.wv.vectors = np.zeros((len(se.wv), 20), dtype=np.float64) with self.assertRaises(TypeError): se._check_pre_training_sanity(1, 1, 1) - se.wv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32) + se.wv.vectors = np.zeros((len(se.wv), 20), dtype=np.float32) - se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float16) + se.wv.vectors_ngrams = np.ones(len(se.wv), dtype=np.float16) with self.assertRaises(TypeError): se._check_pre_training_sanity(1, 1, 1) - se.wv.vectors_ngrams = np.ones(len(se.wv.vocab), dtype=np.float32) + se.wv.vectors_ngrams = np.ones(len(se.wv), dtype=np.float32) - se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float16) + se.wv.vectors_vocab = np.ones(len(se.wv), dtype=np.float16) with self.assertRaises(TypeError): se._check_pre_training_sanity(1, 1, 1) - se.wv.vectors_vocab = np.ones(len(se.wv.vocab), dtype=np.float32) + se.wv.vectors_vocab = np.ones(len(se.wv), dtype=np.float32) - se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=int) + se.sv.vectors = np.zeros((len(se.wv), 20), dtype=int) with self.assertRaises(TypeError): se._check_pre_training_sanity(1, 1, 1) - se.sv.vectors = np.zeros((len(se.wv.vocab), 20), dtype=np.float32) + se.sv.vectors = np.zeros((len(se.wv), 20), dtype=np.float32) - se.word_weights = np.ones(len(se.wv.vocab), dtype=bool) + se.word_weights = np.ones(len(se.wv), dtype=bool) with self.assertRaises(TypeError): se._check_pre_training_sanity(1, 1, 1) - se.word_weights = np.ones(len(se.wv.vocab), dtype=np.float32) + se.word_weights = np.ones(len(se.wv), dtype=np.float32) def test_check_pre_train_statistics(self): - ft = FastText(min_count=1, size=5) + ft = FastText(min_count=1, vector_size=5) ft.build_vocab(SENTENCES) se = BaseSentence2VecModel(ft) - for v in se.wv.vocab: - se.wv.vocab[v].count = 1 + for v in se.wv.key_to_index: + se.wv.set_vecattr(v, "count", 1) # Just throws multiple warnings warning se._check_pre_training_sanity(1, 1, 1) @@ -379,7 +379,7 @@ def test_move_w2v_vectors_to_disk_from_init(self): p_target.unlink() def test_move_ft_vectors_to_disk_from_init(self): - ft = FastText(min_count=1, size=DIM) + ft = FastText(min_count=1, vector_size=DIM) ft.build_vocab(SENTENCES) p = TEST_DATA / "test_vecs" diff --git a/test/test_sentencevectors.py b/test/test_sentencevectors.py index b52f9ce..5069b2e 100644 --- a/test/test_sentencevectors.py +++ b/test/test_sentencevectors.py @@ -23,12 +23,13 @@ TEST_DATA = Path(__file__).parent / "test_data" CORPUS = TEST_DATA / "test_sentences.txt" DIM = 5 -W2V = Word2Vec(min_count=1, size=DIM, seed=42) +W2V = Word2Vec(min_count=1, vector_size=DIM, seed=42) with open(CORPUS, "r") as file: SENTENCES = [l.split() for _, l in enumerate(file)] W2V.build_vocab(SENTENCES) -np.random.seed(42) -W2V.wv.vectors = np.random.uniform(size=W2V.wv.vectors.shape).astype(np.float32) + +rng = np.random.default_rng(12345) +W2V.wv.vectors = rng.uniform(size=W2V.wv.vectors.shape).astype(np.float32) 
class TestSentenceVectorsFunctions(unittest.TestCase): @@ -127,13 +128,19 @@ def test_most_similar(self): m = Average(W2V) m.train(sentences) o = m.sv.most_similar(positive=0) - self.assertEqual(45, o[0][0]) - self.assertEqual(35, o[1][0]) + self.assertEqual(50, o[0][0]) + self.assertEqual(58, o[1][0]) o = m.sv.most_similar(positive=0, indexable=sentences) - self.assertEqual("Looks good and fits snug", o[0][0]) + self.assertEqual( + "A basic phone which does what it is suppose to do , the thing i like most is the distinctive ring", + o[0][0], + ) o = m.sv.most_similar(positive=0, indexable=sent_ind) - self.assertEqual("Looks good and fits snug".split(), o[0][0][0]) + self.assertEqual( + "A basic phone which does what it is suppose to do , the thing i like most is the distinctive ring".split(), + o[0][0][0], + ) def test_most_similar_vec(self): sentences = IndexedLineDocument(CORPUS) @@ -142,19 +149,19 @@ def test_most_similar_vec(self): m.sv.init_sims() v = m.sv.get_vector(0, use_norm=True) o = m.sv.most_similar(positive=v) - # Includes 0 obviously - self.assertEqual(45, o[1][0]) - self.assertEqual(35, o[2][0]) + self.assertEqual(0, o[0][0]) + self.assertEqual(50, o[1][0]) + self.assertEqual(58, o[2][0]) - def test_most_similar_vecs(self): + def test_most_similar_vectors(self): sentences = IndexedLineDocument(CORPUS) m = Average(W2V) m.train(sentences) m.sv.init_sims() v = m.sv[[0, 1]] o = m.sv.most_similar(positive=v) - self.assertEqual(1, o[0][0]) - self.assertEqual(0, o[1][0]) + self.assertEqual(10, o[0][0]) + self.assertEqual(11, o[1][0]) def test_most_similar_wrong_indexable(self): def indexable(self): @@ -190,36 +197,36 @@ def test_most_similar_restrict_size_tuple(self): o = m.sv.most_similar(positive=1, topn=20, restrict_size=(5, 25)) self.assertEqual(20, len(o)) - self.assertEqual(9, o[0][0]) + self.assertEqual(11, o[0][0]) o = m.sv.most_similar( positive=1, topn=20, restrict_size=(5, 25), indexable=sentences ) self.assertEqual(20, len(o)) - self.assertEqual(9, o[0][1]) + self.assertEqual(11, o[0][1]) def test_similar_by_word(self): sentences = IndexedLineDocument(CORPUS) m = Average(W2V) m.train(sentences) o = m.sv.similar_by_word(word="the", wv=m.wv) - self.assertEqual(96, o[0][0]) + self.assertEqual(5, o[0][0]) o = m.sv.similar_by_word(word="the", wv=m.wv, indexable=sentences) - self.assertEqual(96, o[0][1]) + self.assertEqual(5, o[0][1]) def test_similar_by_vector(self): sentences = IndexedLineDocument(CORPUS) m = Average(W2V) m.train(sentences) o = m.sv.similar_by_vector(m.wv["the"]) - self.assertEqual(96, o[0][0]) + self.assertEqual(5, o[0][0]) def test_similar_by_sentence(self): sentences = IndexedLineDocument(CORPUS) m = Average(W2V) m.train(sentences) o = m.sv.similar_by_sentence(sentence=["the", "product", "is", "good"], model=m) - self.assertEqual(4, o[0][0]) + self.assertEqual(26, o[0][0]) def test_similar_by_sentence_wrong_model(self): sentences = IndexedLineDocument(CORPUS) diff --git a/test/test_sif.py b/test/test_sif.py index f22d12b..421fc54 100644 --- a/test/test_sif.py +++ b/test/test_sif.py @@ -13,7 +13,7 @@ TEST_DATA = Path(__file__).parent / "test_data" CORPUS = TEST_DATA / "test_sentences.txt" DIM = 50 -W2V = Word2Vec(min_count=1, size=DIM) +W2V = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: SENTENCES = [l.split() for _, l in enumerate(file)] W2V.build_vocab(SENTENCES) @@ -98,7 +98,7 @@ def test_compute_sif_weights(self): alpha = self.model.alpha sif = alpha / (alpha + pw) - idx = self.model.wv.vocab[w].index + idx = 
self.model.wv.key_to_index[w] self.model._compute_sif_weights() self.assertTrue(np.allclose(self.model.word_weights[idx], sif)) @@ -121,14 +121,15 @@ def test_save_issue(self): model.sv.similar_by_sentence("test sentence".split(), model=model) def test_broken_vocab(self): - w2v = Word2Vec(min_count=1, size=DIM) + w2v = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: w2v.build_vocab([l.split() for l in file]) - for k in w2v.wv.vocab: - w2v.wv.vocab[k].count = np.nan + + for k in w2v.wv.key_to_index: + w2v.wv.set_vecattr(k, "count", -1) model = SIF(w2v) - with self.assertRaises(RuntimeError): + with self.assertRaises(ValueError): model.train(self.sentences) diff --git a/test/test_usif.py b/test/test_usif.py index 3435b1b..6ea62de 100644 --- a/test/test_usif.py +++ b/test/test_usif.py @@ -12,7 +12,7 @@ CORPUS = Path(__file__).parent / "test_data" / "test_sentences.txt" DIM = 50 -W2V = Word2Vec(min_count=1, size=DIM) +W2V = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: SENTENCES = [l.split() for _, l in enumerate(file)] W2V.build_vocab(SENTENCES) @@ -92,7 +92,7 @@ def test_dtype_sanity_svd_vecs(self): def test_compute_usif_weights(self): w = "Good" pw = 1.916650481770269e-08 - idx = self.model.wv.vocab[w].index + idx = self.model.wv.key_to_index[w] self.model.length = 11 a = 0.17831555484795414 usif = a / ((a / 2) + pw) @@ -105,16 +105,16 @@ def test_train(self): self.assertTrue(np.isfinite(self.model.sv.vectors).all()) def test_broken_vocab(self): - w2v = Word2Vec(min_count=1, size=DIM) + w2v = Word2Vec(min_count=1, vector_size=DIM) with open(CORPUS, "r") as file: w2v.build_vocab([l.split() for l in file]) - for k in w2v.wv.vocab: - w2v.wv.vocab[k].count = np.nan + for k in w2v.wv.key_to_index: + w2v.wv.set_vecattr(k, "count", -1) model = uSIF(w2v) - with self.assertRaises(RuntimeError): + with self.assertRaises(ValueError): model.train(self.sentences) def test_zero_div_error(self): diff --git a/tests.sh b/tests.sh new file mode 100644 index 0000000..09972ad --- /dev/null +++ b/tests.sh @@ -0,0 +1,21 @@ +#! /bin/bash + +export GENSIM_VERSION=4.0.0 +DOCKER_BUILDKIT=1 docker build -t fse-$GENSIM_VERSION --build-arg gensim=$GENSIM_VERSION . +docker run --rm "fse-$GENSIM_VERSION" + +export GENSIM_VERSION=4.0.1 +DOCKER_BUILDKIT=1 docker build -t fse-$GENSIM_VERSION --build-arg gensim=$GENSIM_VERSION . +docker run --rm "fse-$GENSIM_VERSION" + +export GENSIM_VERSION=4.1.0 +DOCKER_BUILDKIT=1 docker build -t fse-$GENSIM_VERSION --build-arg gensim=$GENSIM_VERSION . +docker run --rm "fse-$GENSIM_VERSION" + +export GENSIM_VERSION=4.1.1 +DOCKER_BUILDKIT=1 docker build -t fse-$GENSIM_VERSION --build-arg gensim=$GENSIM_VERSION . +docker run --rm "fse-$GENSIM_VERSION" + +export GENSIM_VERSION=4.1.2 +DOCKER_BUILDKIT=1 docker build -t fse-$GENSIM_VERSION --build-arg gensim=$GENSIM_VERSION . +docker run --rm "fse-$GENSIM_VERSION" \ No newline at end of file
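Reviewer note on the test migration: the bulk of the test-file changes above are a mechanical port from the gensim 3.x vocabulary API to the 4.x `KeyedVectors` API. A minimal sketch of the mapping (the toy corpus below is hypothetical, for illustration only):

```
from gensim.models import Word2Vec

sentences = [["hello", "world"], ["hello", "gensim"]]  # hypothetical toy data

# gensim < 4: Word2Vec(sentences, min_count=1, size=5)
model = Word2Vec(sentences, min_count=1, vector_size=5)  # `size` -> `vector_size`
wv = model.wv

# gensim < 4: wv.vocab["hello"].index
idx = wv.key_to_index["hello"]

# gensim < 4: wv.vocab["hello"].count (read and write)
count = wv.get_vecattr("hello", "count")
wv.set_vecattr("hello", "count", count + 1)

# gensim < 4: wv.index2word and len(wv.vocab)
words = wv.index_to_key
vocab_size = len(wv)
```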
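Reviewer note on the RNG change in `test_sentencevectors.py`: the fixture moves from the legacy global `np.random.seed(42)` to a local `np.random.default_rng(12345)` Generator, which keeps the test's randomness reproducible without mutating global NumPy state. A sketch of the pattern (array shape chosen arbitrarily for illustration):

```
import numpy as np

rng = np.random.default_rng(12345)  # local Generator; leaves global np.random untouched
vectors = rng.uniform(size=(10, 5)).astype(np.float32)
```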
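Reviewer note on `tests.sh`: the script repeats the same build-and-run block once per gensim release. If the repetition is not intentional, an equivalent loop form (a sketch, assuming the same five versions) would be:

```
#! /bin/bash
# Build and test the image against each supported gensim release.
for GENSIM_VERSION in 4.0.0 4.0.1 4.1.0 4.1.1 4.1.2; do
    DOCKER_BUILDKIT=1 docker build -t "fse-$GENSIM_VERSION" --build-arg gensim="$GENSIM_VERSION" .
    docker run --rm "fse-$GENSIM_VERSION"
done
```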