Skip to content

Commit

Permalink
filter out non-english (#124)
Browse files Browse the repository at this point in the history
* filter out non-english

* add language detection using spacy and pass in additional parameters
  • Loading branch information
longshuicy authored Aug 8, 2024
1 parent e7acf92 commit 8dc9137
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 2 deletions.
4 changes: 4 additions & 0 deletions containerized_analytics/smile/topic_modeling/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.6] - 07-16-2024

### Changed
- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123)

## [0.1.5] - 01-23-2024

Expand Down
1 change: 1 addition & 0 deletions containerized_analytics/smile/topic_modeling/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ ENV RABBITMQ_HOST="rabbitmq"
# install dependency libraries and download required data
RUN pip install --no-cache-dir -r requirement.txt \
&& python3 -m nltk.downloader -d /usr/local/share/nltk_data stopwords wordnet \
&& python3 -m spacy download en_core_web_sm \
# cron job clean tmp folder
&& chmod u+x ./clear_cache.sh \
&& chmod 0644 ./clear_cache_cron \
Expand Down
13 changes: 12 additions & 1 deletion containerized_analytics/smile/topic_modeling/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,18 @@ def algorithm(df, params):

output = {}

gensim_tm = Gensim_Topic_Modeling(df, column=params["column"])
# Check if english_only and language_score exist in params
english_only_param = params["english_only"] if "english_only" in params else True
language_score_param = params["language_score"] if "language_score" in params else 0.9

# Call the Gensim_Topic_Modeling function
gensim_tm = Gensim_Topic_Modeling(
df,
column=params["column"],
english_only=english_only_param,
language_score=language_score_param
)

data_lemmatized, id2word, corpus = gensim_tm.preprocessing()
output['lemmatized'] = data_lemmatized

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,36 @@
from nltk import WordNetLemmatizer
import pyLDAvis
import pyLDAvis.gensim
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language


class Gensim_Topic_Modeling:

def __init__(self, df, column):
def __init__(self, df, column, english_only=True, language_score=0.9):
    """Collect texts from ``df[column]`` and set up language detection.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data.
    column : str
        Name of the column holding the documents to model.
    english_only : bool, default True
        If True, ``preprocessing()`` keeps only texts detected as English.
    language_score : float, default 0.9
        Minimum detection confidence required to keep a text.
    """
    # Keep non-empty, non-null cells as plain strings.
    self.data = df[df[column] != ''][column].dropna().astype(
        'str').tolist()

    # Load a SpaCy model
    self.nlp = spacy.load('en_core_web_sm')

    # Register the language-detector factory at most once:
    # @Language.factory raises if the same factory name is registered
    # again, which would break creating a second instance of this class
    # in the same process.
    if not Language.has_factory("language_detector"):
        @Language.factory("language_detector")
        def get_lang_detector(nlp, name):
            return LanguageDetector()

    # Add the language detector to the end of the pipeline.
    self.nlp.add_pipe('language_detector', last=True)
    self.english_only = english_only
    self.language_score = language_score

def preprocessing(self):
# Detect and keep only English texts
if self.english_only:
self.data = [sent for sent in self.data if
self.nlp(sent)._.language['language'] == 'en'
and self.nlp(sent)._.language['score'] > self.language_score]

self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data]
self.data = [re.sub('\s+', ' ', sent) for sent in self.data]
self.data = [re.sub("\'", "", sent) for sent in self.data]
Expand Down
2 changes: 2 additions & 0 deletions containerized_analytics/smile/topic_modeling/requirement.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ numpy>=1.18.1
pandas>=1.1.4
pyLDAvis==2.1.2
pika>=1.1.0
spacy==3.7.5
spacy-langdetect==0.1.2

0 comments on commit 8dc9137

Please sign in to comment.