Skip to content

Commit

Permalink
filter out non-english (#124)
Browse files Browse the repository at this point in the history
* filter out non-english

* add language detection using spacy and pass in additional parameters
  • Loading branch information
longshuicy authored Aug 8, 2024
1 parent e7acf92 commit 8dc9137
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 2 deletions.
4 changes: 4 additions & 0 deletions containerized_analytics/smile/topic_modeling/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.6] - 07-16-2024

### Changed
- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123)

## [0.1.5] - 01-23-2024

Expand Down
1 change: 1 addition & 0 deletions containerized_analytics/smile/topic_modeling/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ ENV RABBITMQ_HOST="rabbitmq"
# install dependency libraries and download required data
RUN pip install --no-cache-dir -r requirement.txt \
&& python3 -m nltk.downloader -d /usr/local/share/nltk_data stopwords wordnet \
&& python3 -m spacy download en_core_web_sm \
# cron job clean tmp folder
&& chmod u+x ./clear_cache.sh \
&& chmod 0644 ./clear_cache_cron \
Expand Down
13 changes: 12 additions & 1 deletion containerized_analytics/smile/topic_modeling/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,18 @@ def algorithm(df, params):

output = {}

gensim_tm = Gensim_Topic_Modeling(df, column=params["column"])
# Check if english_only and language_score exist in params
english_only_param = params["english_only"] if "english_only" in params else True
language_score_param = params["language_score"] if "language_score" in params else 0.9

# Call the Gensim_Topic_Modeling function
gensim_tm = Gensim_Topic_Modeling(
df,
column=params["column"],
english_only=english_only_param,
language_score=language_score_param
)

data_lemmatized, id2word, corpus = gensim_tm.preprocessing()
output['lemmatized'] = data_lemmatized

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,36 @@
from nltk import WordNetLemmatizer
import pyLDAvis
import pyLDAvis.gensim
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language


class Gensim_Topic_Modeling:

def __init__(self, df, column):
def __init__(self, df, column, english_only=True, language_score=0.9):
    """Collect texts from ``df[column]`` and set up language detection.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    column : str
        Name of the column holding the documents to model.
    english_only : bool, default True
        If True, ``preprocessing()`` keeps only texts detected as English.
    language_score : float, default 0.9
        Minimum detection confidence required to keep a text.
    """
    # Keep non-empty, non-null cells as plain strings.
    self.data = df[df[column] != ''][column].dropna().astype(
        'str').tolist()

    # Load a SpaCy model
    self.nlp = spacy.load('en_core_web_sm')

    # Register the language-detector factory at most once:
    # @Language.factory raises if the same factory name is registered
    # again, which would break creating a second instance of this class
    # in the same process.
    if not Language.has_factory("language_detector"):
        @Language.factory("language_detector")
        def get_lang_detector(nlp, name):
            return LanguageDetector()

    # Add the language detector to the end of the pipeline.
    self.nlp.add_pipe('language_detector', last=True)
    self.english_only = english_only
    self.language_score = language_score

def preprocessing(self):
# Detect and keep only English texts
if self.english_only:
self.data = [sent for sent in self.data if
self.nlp(sent)._.language['language'] == 'en'
and self.nlp(sent)._.language['score'] > self.language_score]

self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data]
self.data = [re.sub('\s+', ' ', sent) for sent in self.data]
self.data = [re.sub("\'", "", sent) for sent in self.data]
Expand Down
2 changes: 2 additions & 0 deletions containerized_analytics/smile/topic_modeling/requirement.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ numpy>=1.18.1
pandas>=1.1.4
pyLDAvis==2.1.2
pika>=1.1.0
spacy==3.7.5
spacy-langdetect==0.1.2

0 comments on commit 8dc9137

Please sign in to comment.