remove unnecessary dependencies

saeeddhqan · Sep 19, 2024 · 2bb4ea2 · 2bb4ea2
1 parent e40c509
commit 2bb4ea2
Show file tree

Hide file tree

Showing 13 changed files with 53 additions and 205 deletions.
diff --git a/maryam.py b/maryam.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 
 from maryam import __main__
 import sys

diff --git a/maryam/core/util/iris/meta_search_util.py b/maryam/core/util/iris/meta_search_util.py
@@ -15,9 +15,11 @@
 from math import trunc
 
 class main:
+
 	def __init__(self):
 		# Bind the class attribute `main.framework` to the instance.
 		self.framework = main.framework
 
+
 	def make_cite(self, url: 'URL string') -> 'cite':
 		urlib = self.framework.urlib(url)
 		path = urlib.path
@@ -40,6 +42,7 @@ def make_cite(self, url: 'URL string') -> 'cite':
 		cite = f"{host}{path}"
 		return cite
 
+
 	def remove_dups(self, res):
 		urls = []
 		new = []
@@ -50,6 +53,7 @@ def remove_dups(self, res):
 				new.append(i)
 		return new
 
+
 	def simple_merge(results) -> 'merging results based on quality of engines':
 		engines_len = len(results)
 		merged = []
@@ -65,14 +69,16 @@ def simple_merge(results) -> 'merging results based on quality of engines':
 
 		return merged
 
+
 	def compute_count_consensus( 
 			e: dict(type=list, help='list of search engines sorted by quality'),
-			l: dict(type=int, help='number of results')) -> 'a list of numbers':
+			l: dict(type=int, help='number of results'),
+		) -> 'a list of numbers':
 		x = len(e)
 		o = {}
 		for i in e:
-			o[i] = trunc(l/x)
-		l -= l - (l%x)
+			o[i] = trunc(l / x)
+		l -= l - (l % x)
 		if l != 0:
 			if l < x:
 				for i in range(l):

diff --git a/maryam/core/util/iris/retriever.py b/maryam/core/util/iris/retriever.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 OWASP Maryam!
 This program is free software: you can redistribute it and/or modify
@@ -87,7 +86,7 @@ def __init__(
 		lowercase=True,
 		preprocessor=None,
 		tokenizer=None,
-		stop_words="english",
+		stop_words='english',
 		token_pattern=r"(?u)\b\w\w+\b",
 		ngram_range=(1, 2),
 		max_df=1,
@@ -124,11 +123,13 @@ def __init__(
 		)
 		self.vectorizer = vectorizer
 
+
 	def fit(self, df, y=None):
 		"""
 		Keep `df` as self.metadata and fit the vectorizer on df["pages"].
 		Each entry of df["pages"] is joined with single spaces into one document
 		string; the fitted matrix is stored in self.tfidf_matrix. `y` is unused.
 		Returns self.
 		"""
 		self.metadata = df
 		documents = [' '.join(page) for page in df["pages"]]
 		self.tfidf_matrix = self.vectorizer.fit_transform(documents)
 		return self
 
+
 	def predict(self, query: str) -> 'OrderedDict':
 		"""
 		Compute the top_n closest documents given a query
@@ -160,10 +161,8 @@ def predict(self, query: str) -> 'OrderedDict':
 			table = prettytable.PrettyTable(["rank", "index", "title"])
 			for i in range(len(closest_docs_indices)):
 				index = closest_docs_indices[i]
-				# if self.paragraphs:
-				#	  article_index = self.paragraphs[int(index)]["index"]
-				#	  title = self.metadata.iloc[int(article_index)]["title"]
-				# else:
+
+
 				title = self.metadata.iloc[int(index)]["title"]
 				table.add_row([rank, index, title])
 				rank += 1

diff --git a/maryam/core/util/iris/safe_searcher.py b/maryam/core/util/iris/safe_searcher.py
diff --git a/maryam/core/util/iris/tf_histogram.py b/maryam/core/util/iris/tf_histogram.py
@@ -17,6 +17,7 @@
 from collections import Counter
 BASEDIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../'))
 
+
 class main:
 
 	def __init__(self, docs: 'documents', form: 'documet form. e.g html', without_punc=True):
@@ -33,18 +34,22 @@ def __init__(self, docs: 'documents', form: 'documet form. e.g html', without_pu
 		if self.without_punc:
 			self._punc()
 
+
 	def remove_stopwords(self, rest):
 		""" Drop from self.words every term listed in data/stopwords.csv or contained in `rest` """
 		stopwords_path = os.path.join(BASEDIR, '../../', 'data', 'stopwords.csv')
 		# Read inside a context manager so the file handle is closed right away
 		# instead of being left for the garbage collector.
 		with open(stopwords_path) as stopwords_file:
 			stops = stopwords_file.read().split(',')
 		self.words = [x for x in self.words if x not in stops and x not in rest]
 
+
 	def _punc(self):
 		self.words = re.findall(r"[\w\-_#]{2,}", self.docs)
 
+
 	def _counter(self, last):
 		""" last: number of terms to show in plot """
 		bow = Counter(self.words)
 		return bow.most_common(last)
 
+
 	def plot_histogram(self, title, last, should_show=False):
 		import pandas as pd
 		import matplotlib.pyplot as plt

diff --git a/maryam/core/util/iris/topic.py b/maryam/core/util/iris/topic.py
@@ -11,21 +11,22 @@
 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 """
-# core/util/iris/topic.py
 # Hatma Suryotrisongko
 
+import os
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from sentence_transformers import SentenceTransformer
+BASEDIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../'))
 
 class main:
 
 	def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
 
 		from dask import dataframe as dd
 		import json
-		from gensim.parsing.preprocessing import remove_stopwords
+		self.stops = open(os.path.join(BASEDIR, '../../', 'data', 'stopwords.csv')).read().split(',')
 
 		if verbose == True:
 			print("\n\n DATASET = reading file : " + inputfile)
@@ -42,7 +43,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
 				print("\n\n csv file (before preprocessing) = ")
 				print(tmp4)
 
-			self.corpus = tmp4[0].str.lower().apply(remove_stopwords).to_numpy()
+			self.corpus = tmp4[0].str.lower().apply(self.remove_stopwords).to_numpy()
 
 		elif filetype == "json":
 			with open(inputfile) as json_file:
@@ -55,7 +56,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
 				print(tmp)
 
 			tmp['td'] = tmp['t'] + ' ' + tmp['d']
-			self.corpus = tmp['td'].str.lower().apply(remove_stopwords).to_numpy()
+			self.corpus = tmp['td'].str.lower().apply(self.remove_stopwords).to_numpy()
 
 		else:
 			print('ERROR, only accept csv or json file!')
@@ -73,6 +74,11 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
 			pd.Series([len(e.split()) for e in self.corpus]).hist()
 			plt.show()
 
+
+	def remove_stopwords(self, text):
+		return ''.join([x for x in text if x not in self.stops])
+
+
 	def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
 
 		from sklearn.cluster import KMeans

diff --git a/maryam/core/util/iris/word_cloud.py b/maryam/core/util/iris/word_cloud.py
diff --git a/maryam/modules/iris/iris.py b/maryam/modules/iris/iris.py
@@ -21,7 +21,7 @@
 	'description': 'Iris is a built-in meta search engine.',
 	'comments': ('It should be note that this is a beta version and has many bugs!',),
 	'contributors': 'Aman, Dimitris, Divya, Vikas, Kunal',
-	'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'searx', 'yahoo'),
+	'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'yahoo'),
 	'options': (
 		('query', None, True, 'Query string', '-q', 'store', str),
 	),

diff --git a/maryam/modules/iris/iris_cluster.py b/maryam/modules/iris/iris_cluster.py
@@ -13,11 +13,11 @@
 """
 
 meta = {
-	'name': 'Iris_Cluster',
+	'name': 'Iris Cluster',
 	'author': 'Shaad',
 	'version': '0.1',
 	'description': 'Get Iris Search result and clustered results for your query',
-	'required': ('kneed', 'mlxtend, numpy, sklearn'),
+	'required': ('kneed', 'mlxtend', 'numpy', 'sklearn'),
 	'options': (
 			('query', None, True, 'Query string', '-q', 'store', str),
 		),
@@ -51,7 +51,7 @@ def module_run(self):
 
 	print('\n\nCLUSTER RESULT: ')
 	for index, title in enumerate(output):
-	    print('\n')
-	    print(f"CLUSTER {index+1}")
-	    print(f"TITLE: {title}")
-	    print('  '+'\n  '.join(output[title]))
+		print('\n')
+		print(f"CLUSTER {index+1}")
+		print(f"TITLE: {title}")
+		print('  '+'\n  '.join(output[title]))
diff --git a/maryam/modules/iris/sentiment.py b/maryam/modules/iris/sentiment.py
@@ -70,7 +70,7 @@ def module_api(self):
 			return
 		DATA = loads(file.read())
 		if key not in DATA and key != None:
-			self.error("The key doesn't exists", 'module_api', 'iris/sentiment')
+			self.error('The key doesn\'t exists', 'module_api', 'iris/sentiment')
 			return
 		if key != None:
 			DATA = DATA[key]

diff --git a/maryam/modules/iris/topicmodeling.py b/maryam/modules/iris/topicmodeling.py
@@ -60,4 +60,4 @@ def module_api(self):
 def module_run(self):
 	output = module_api(self)
 	self.output("\n\nOutput = \n")
-	self.output( output )
+	self.output( output )