diff --git a/containerized_analytics/smile/topic_modeling/CHANGELOG.md b/containerized_analytics/smile/topic_modeling/CHANGELOG.md index 7d167e7..89c98d4 100644 --- a/containerized_analytics/smile/topic_modeling/CHANGELOG.md +++ b/containerized_analytics/smile/topic_modeling/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.6] - 07-16-2024 + +### Added +- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123) ## [0.1.5] - 01-23-2024 diff --git a/containerized_analytics/smile/topic_modeling/Dockerfile b/containerized_analytics/smile/topic_modeling/Dockerfile index 82fb3be..477e3fd 100644 --- a/containerized_analytics/smile/topic_modeling/Dockerfile +++ b/containerized_analytics/smile/topic_modeling/Dockerfile @@ -12,6 +12,7 @@ ENV RABBITMQ_HOST="rabbitmq" # install dependency libraries and download required data RUN pip install --no-cache-dir -r requirement.txt \ && python3 -m nltk.downloader -d /usr/local/share/nltk_data stopwords wordnet \ +&& python3 -m spacy download en_core_web_sm \ # cron job clean tmp folder && chmod u+x ./clear_cache.sh \ && chmod 0644 ./clear_cache_cron \ diff --git a/containerized_analytics/smile/topic_modeling/algorithm.py b/containerized_analytics/smile/topic_modeling/algorithm.py index 5b98ce8..a3c8664 100644 --- a/containerized_analytics/smile/topic_modeling/algorithm.py +++ b/containerized_analytics/smile/topic_modeling/algorithm.py @@ -12,7 +12,18 @@ def algorithm(df, params): output = {} - gensim_tm = Gensim_Topic_Modeling(df, column=params["column"]) + # Check if english_only and language_score exist in params + english_only_param = params["english_only"] if "english_only" in params else True + language_score_param = params["language_score"] 
if "language_score" in params else 0.9 + + # Call the Gensim_Topic_Modeling function + gensim_tm = Gensim_Topic_Modeling( + df, + column=params["column"], + english_only=english_only_param, + language_score=language_score_param + ) + data_lemmatized, id2word, corpus = gensim_tm.preprocessing() output['lemmatized'] = data_lemmatized diff --git a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py index 356b573..d96b0be 100644 --- a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py +++ b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py @@ -7,15 +7,36 @@ from nltk import WordNetLemmatizer import pyLDAvis import pyLDAvis.gensim +import spacy +from spacy_langdetect import LanguageDetector +from spacy.language import Language class Gensim_Topic_Modeling: - def __init__(self, df, column): + def __init__(self, df, column, english_only=True, language_score=0.9): self.data = df[df[column] != ''][column].dropna().astype( 'str').tolist() + # Load a SpaCy model + self.nlp = spacy.load('en_core_web_sm') + + # Add the language detector to the pipeline + @Language.factory("language_detector") + def get_lang_detector(nlp, name): + return LanguageDetector() + + self.nlp.add_pipe('language_detector', last=True) + self.english_only = english_only + self.language_score = language_score + def preprocessing(self): + # Detect and keep only English texts + if self.english_only: + self.data = [sent for sent in self.data if + self.nlp(sent)._.language['language'] == 'en' + and self.nlp(sent)._.language['score'] > self.language_score] + self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data] self.data = [re.sub('\s+', ' ', sent) for sent in self.data] self.data = [re.sub("\'", "", sent) for sent in self.data] diff --git a/containerized_analytics/smile/topic_modeling/requirement.txt b/containerized_analytics/smile/topic_modeling/requirement.txt index 
ca7e73c..f4962d9 100644 --- a/containerized_analytics/smile/topic_modeling/requirement.txt +++ b/containerized_analytics/smile/topic_modeling/requirement.txt @@ -5,3 +5,5 @@ numpy>=1.18.1 pandas>=1.1.4 pyLDAvis==2.1.2 pika>=1.1.0 +spacy==3.7.5 +spacy-langdetect==0.1.2