forked from RiskThinking/work-samples
Commit b9794da
Lakpa Sherpa authored and committed on May 29, 2023
1 parent: 5319652
Showing 100 changed files with 237,880 additions and 110,720 deletions.
Dockerfile (Airflow image):
@@ -1,3 +1,16 @@
FROM apache/airflow:2.3.0

# Install a headless JDK as root so spark-submit can run from the Airflow workers
USER root
RUN apt-get update && \
    apt-get install -y --no-install-recommends openjdk-11-jre-headless && \
    apt-get autoremove -yqq --purge && \
    apt-get clean

# Drop back to the unprivileged airflow user for pip installs
USER airflow
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-arm64

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY --chown=airflow:root ./dags /opt/airflow/dags
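Note that JAVA_HOME above is hardcoded to the arm64 JDK path, so this image only works on ARM hosts; on amd64 the Debian path ends in -amd64 instead. A sketch (not part of this commit) that derives the path from the build platform via BuildKit's automatic TARGETARCH argument, so one Dockerfile builds on both architectures:

    # TARGETARCH is populated automatically by BuildKit (amd64, arm64, ...);
    # the default is only a fallback for the legacy builder
    ARG TARGETARCH=arm64
    ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-${TARGETARCH}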
Dockerfile (API image):
@@ -0,0 +1,17 @@
FROM python:3.8-alpine

WORKDIR /app

# Install app dependencies: bash and a JDK (PySpark needs Java)
RUN apk add bash openjdk11 --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community

ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk

# Build toolchain for pip packages with native extensions
RUN apk --no-cache add musl-dev linux-headers g++

COPY api /app
RUN pip install --upgrade pip
RUN pip install -r /app/requirements.txt

CMD ["python", "/app/app.py"]
Dockerfile (Spark image):
@@ -0,0 +1,8 @@
FROM bitnami/spark:latest

USER root
# RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar --output /opt/bitnami/spark/jars/hadoop-aws-3.3.1.jar

# COPY spark/app /usr/local/spark/app
COPY requirements_spark.txt .
RUN pip install -r requirements_spark.txt
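The commented-out curl fetches the hadoop-aws connector jar, which Spark needs before it can read or write s3a:// URLs. If S3 access were required, re-enabling it is one line (a sketch; the 3.3.1 version comes from the commented line itself and must match the Hadoop build inside the image):

    RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar \
        --output /opt/bitnami/spark/jars/hadoop-aws-3.3.1.jar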
Some changed files did not render in this view; one file was deleted.
Requirements file (API):
@@ -0,0 +1,4 @@
flask
numpy
# scikit-learn
pyspark
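None of these dependencies is pinned, so each image rebuild may pull different versions. A pinned variant for reproducible builds (version numbers are illustrative assumptions, not taken from this commit):

    flask==2.3.2
    numpy==1.24.3
    pyspark==3.4.0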
Three binary files changed (not shown).
Airflow DAG (dag_id: stock_spark_airflow):
@@ -0,0 +1,104 @@
from datetime import timedelta, datetime
import os

from airflow import DAG
from airflow.utils.task_group import TaskGroup
from airflow.operators.python import PythonOperator
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(1)
}

today = datetime.today().strftime('%Y%m%d')
stage_path = "/usr/local/spark/staging/"

spark_dag = DAG(
    dag_id="stock_spark_airflow",
    default_args=default_args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60 * 24),
    description='Use case of SparkSubmitOperator in Airflow',
)


def verify_raw_data_path_func():
    """Create today's staging directory for raw data processing if it is missing."""
    output_path = stage_path + today + "/raw_data_processing"
    if not os.path.exists(output_path):
        os.makedirs(output_path)
        print("The new directory is created:", output_path)


def verify_feature_data_path_func():
    """Create today's staging directory for feature engineering if it is missing."""
    output_path = stage_path + today + "/feature_engineering"
    if not os.path.exists(output_path):
        os.makedirs(output_path)
        print("The new directory is created:", output_path)


with spark_dag:
    # Stage 1: process raw stock and ETF data in parallel.
    with TaskGroup("raw_data_processing", tooltip="Tasks for raw_data_processing") as task_group_raw_data_processing:
        verify_raw_data_path = PythonOperator(
            task_id="verify_raw_data_path",
            python_callable=verify_raw_data_path_func)

        stock_data_processing = SparkSubmitOperator(
            application="/usr/local/spark/app/raw_data_processing.py",
            conn_id='spark_local',
            task_id='stock_data_processing',
            application_args=['stocks'],
        )

        etf_data_processing = SparkSubmitOperator(
            application="/usr/local/spark/app/raw_data_processing.py",
            conn_id='spark_local',
            task_id='etf_data_processing',
            application_args=['etfs'],
        )

        verify_raw_data_path >> [stock_data_processing, etf_data_processing]

    # Stage 2: derive features from the processed stock and ETF data.
    with TaskGroup("feature_engineering", tooltip="Tasks for feature_engineering") as task_group_feature_engineering:
        task_verify_feature_data_path = PythonOperator(
            task_id='verify_feature_data_path',
            python_callable=verify_feature_data_path_func
        )

        stock_feature_engineering = SparkSubmitOperator(
            application="/usr/local/spark/app/feature_engineering_processing.py",
            conn_id='spark_local',
            task_id='stock_feature_processing',
            application_args=['stocks'],
        )

        etf_feature_engineering = SparkSubmitOperator(
            application="/usr/local/spark/app/feature_engineering_processing.py",
            conn_id='spark_local',
            task_id='etf_feature_processing',
            application_args=['etfs'],
        )

        task_verify_feature_data_path >> [stock_feature_engineering, etf_feature_engineering]

    # Stage 3: train the model once both task groups have finished.
    train_model = SparkSubmitOperator(
        application="/usr/local/spark/app/train_model.py",
        conn_id='spark_local',
        task_id='train_model',
    )

    task_group_raw_data_processing >> task_group_feature_engineering >> train_model
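Every SparkSubmitOperator above references conn_id='spark_local', which must be registered in Airflow before the DAG can run. A minimal sketch using the Airflow CLI (the master host and port are assumptions; they depend on how the Spark service is exposed in the deployment):

    airflow connections add spark_local \
        --conn-type spark \
        --conn-host spark://spark-master \
        --conn-port 7077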