Commit efd9954
re-worked how configuration variables are handled
1 parent 2fa09aa

5 files changed (+46 -41)

.gitignore (+1)

@@ -0,0 +1 @@
+configurations.sh
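
With configurations.sh ignored, each deployment keeps its own untracked copy of the real settings. A plausible setup workflow (an assumption; the commit itself does not document one) is to copy the template and fill in the placeholders:

# configurations.sh is gitignored, so create it locally from the template
# and replace each placeholder with a deployment-specific value.
cp configurations_template.sh configurations.sh
"${EDITOR:-vi}" configurations.sh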

README.md (+1)

@@ -13,6 +13,7 @@ After the model is trained in pyspark, these data are stored in a postgres database
 
 ## Directory Structure
 |-- README.md
+|-- configurations_template.sh
 |-- dash
 | -- topicMakr_app.py
 |-- data

configurations_template.sh (+13)

@@ -0,0 +1,13 @@
+# Set configurations
+
+spark_master_ui="spark://<spark master node public IP>:6066"
+
+publicDNS="spark://<spark master public DNS>:7077"
+bucket="<s3 bucket with book files>"
+bucketfolder="<folder path in s3 bucket containing book files>"
+
+k="<integer number of topics for LDA to learn>"
+maxIter="<integer number of iterations of LDA training>"
+
+topwords="<integer number of top words per topic to store in postgres table>"
+postgresURL="jdbc:postgresql://<EC2 instance with postgres database>/<database name>"
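
The template only describes the shape of each value; the assignments are plain bash (no spaces around =) so the file can be sourced. A configurations.sh filled in with the concrete endpoints that this commit removes from src/topicMakr_pyspark.py and topicMakr_sparksubmit.sh would look like this (a reconstruction for reference, not a file in the commit):

# Set configurations (values from the previously hardcoded dicts)
spark_master_ui="spark://54.227.182.209:6066"

publicDNS="spark://ec2-54-227-182-209.compute-1.amazonaws.com:7077"
bucket="maxcantor-insight-deny2019a-bookbucket"
bucketfolder="gutenberg_data/unzipped_data/"

k=20
maxIter=100

topwords=7
postgresURL="jdbc:postgresql://ec2-54-205-173-0.compute-1.amazonaws.com/lda_booktopics"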

src/topicMakr_pyspark.py (+18 -40)
@@ -38,27 +38,27 @@ def aws_access(*argv):
     return aws_access_key_id, aws_secret_access_key
 
 
-def s3_to_pyspark(config, aws_access_key_id, aws_secret_access_key):
+def s3_to_pyspark(aws_access_key_id, aws_secret_access_key):
     """
     Set up spark context and s3 bucket and folder config
     """
     conf = SparkConf()
-    conf.setMaster(config["publicDNS"])
+    conf.setMaster(os.getenv("publicDNS"))
     conf.setAppName("topicMakr")
     sc = SparkContext(conf=conf)
     sqlContext = SQLContext(sc)
     # Connect to bucket with boto3
     s3 = boto3.resource('s3')
-    bucket = s3.Bucket(config["bucket"])
+    bucket = s3.Bucket(os.getenv("bucket"))
     # Loop through all files and create a file list
     filelist = []
-    for obj in bucket.objects.filter(Prefix=config["bucketfolder"]):
+    for obj in bucket.objects.filter(Prefix=os.getenv("bucketfolder")):
         if obj.size:
             filelist.append("s3n://" + bucket.name + "/" + obj.key)
 
     # Filter list to just books (named with numbers as per project gutenberg)
     filelist = fnmatch.filter(filelist, "s3n://" + bucket.name + "/" +
-                              config["bucketfolder"] + "[0-9]*.txt")
+                              os.getenv("bucketfolder") + "[0-9]*.txt")
 
     def preproc(iterator):
         """
@@ -117,7 +117,7 @@ def preproc(iterator):
     return sqlContext, tokens, titles
 
 
-def books_to_lda(ldaparam, sqlContext, tokens, titles):
+def books_to_lda(sqlContext, tokens, titles):
     """
     Convert tokens to TF-IDF and run LDA model
     """
@@ -144,13 +144,12 @@ def books_to_lda(ldaparam, sqlContext, tokens, titles):
     result_tfidf = idfModel.transform(result_cv)
 
     # Run LDA model
-    lda = LDA(k=ldaparam["k"], maxIter=ldaparam["maxIter"])
+    lda = LDA(k=int(os.getenv("k")), maxIter=int(os.getenv("maxIter")))
     model = lda.fit(result_tfidf)
     return vocab, result_tfidf, model
 
 
-def postgres_tables(SQLconf, ldaparam, vocab,
-                    result_tfidf, model, sqlContext, titles):
+def postgres_tables(vocab, result_tfidf, model, sqlContext, titles):
     """
     Set up tables and write to postgres
     """
@@ -177,7 +176,7 @@ def postgres_tables(SQLconf, ldaparam, vocab,
                    .cast(StringType()))
 
     # Get top 7 words per topic
-    topics = model.describeTopics(maxTermsPerTopic=SQLconf["topwords"])
+    topics = model.describeTopics(maxTermsPerTopic=int(os.getenv("topwords")))
     #
     # Add vocab to topics dataframe
     topics_rdd = topics.rdd
@@ -195,46 +194,25 @@
 
     # Save dataframes to postgreSQL database on postgres_DB ec2 instance
     topics.write.format('jdbc') \
-        .options(url=SQLconf["postgresURL"],
+        .options(url=os.getenv("postgresURL"),
                  driver='org.postgresql.Driver', dbtable='topics') \
         .mode('overwrite').save()
 
     top_doc_table.write.format('jdbc') \
-        .options(url=SQLconf["postgresURL"],
+        .options(url=os.getenv("postgresURL"),
                  driver='org.postgresql.Driver', dbtable='documents') \
         .mode('overwrite').save()
 
 
-# Set configurations
-config = {
-    "publicDNS": "spark://ec2-54-227-182-209.compute-1.amazonaws.com:7077",
-    "bucket": "maxcantor-insight-deny2019a-bookbucket",
-    "bucketfolder": "gutenberg_data/unzipped_data/"
-}
-
-ldaparam = {
-    "k": 20,
-    "maxIter": 100
-}
-
-SQLconf = {
-    "topwords": 7,
-    "postgresURL": "jdbc:postgresql://" +
-                   "ec2-54-205-173-0.compute-1.amazonaws.com/lda_booktopics"
-}
-
 if __name__ == '__main__':
     """
     Run pipeline functions
     """
     [aws_access_key_id, aws_secret_access_key] = aws_access(*sys.argv)
-    [sqlContext, tokens, titles] = s3_to_pyspark(
-        config,
-        aws_access_key_id,
-        aws_secret_access_key)
-    [vocab, result_tfidf, model] = books_to_lda(
-        ldaparam, sqlContext,
-        tokens, titles)
-    postgres_tables(
-        SQLconf, ldaparam, vocab, result_tfidf,
-        model, sqlContext, titles)
+
+    [sqlContext, tokens, titles] = s3_to_pyspark(aws_access_key_id,
+                                                 aws_secret_access_key)
+
+    [vocab, result_tfidf, model] = books_to_lda(sqlContext, tokens, titles)
+
+    postgres_tables(vocab, result_tfidf, model, sqlContext, titles)
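
Two details matter for the os.getenv lookups in this file: the module must import os at the top (the visible hunks do not show the import block), and os.getenv always returns a string, or None when a variable is unset, which is why the numeric settings k, maxIter, and topwords are cast with int() before reaching the pyspark APIs. For a quick run outside the spark-submit wrapper, the variables can also be supplied inline on the command line (a sketch; the localhost endpoints are stand-in values, not part of the commit):

# Inline environment variables are visible to os.getenv in the driver.
publicDNS="spark://localhost:7077" \
bucket="maxcantor-insight-deny2019a-bookbucket" \
bucketfolder="gutenberg_data/unzipped_data/" \
k=20 maxIter=100 topwords=7 \
postgresURL="jdbc:postgresql://localhost/lda_booktopics" \
python topicMakr_pyspark.py  # plus whatever credential arguments aws_access() expects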

topicMakr_sparksubmit.sh (+13 -1)
@@ -1,7 +1,19 @@
 #!/usr/bin/env bash
 
+source ./configurations.sh
+
+export publicDNS
+export bucket
+export bucketfolder
+
+export k
+export maxIter
+
+export topwords
+export postgresURL
+
 spark-submit \
     --jars /home/ubuntu/postgresql-42.2.5.jar \
     --class ranking \
-    --master spark://54.227.182.209:6066 \
+    --master $spark_master_ui \
     /home/ubuntu/topicMakr_pyspark.py
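
Note that spark_master_ui is the one variable left unexported: it only needs to be visible to this script, which expands it on the --master line, while the other settings must be exported so that os.getenv can see them in the Python driver process. The one-by-one export list could also be collapsed with bash's auto-export mode (an equivalent alternative, not what the commit does):

# Auto-export every variable assigned while configurations.sh is sourced.
set -a
source ./configurations.sh
set +a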
