-
Notifications
You must be signed in to change notification settings - Fork 27
End to End Processing
George Richardson edited this page May 3, 2017
·
3 revisions
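This assumes you are working in a Notebook in which `db_host` (the database host name) and `url_list` (the URLs to process) have already been defined; the code below uses both without defining them.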
import os
import sys
from sqlalchemy import create_engine
from sqlalchemy import exc
from sqlalchemy import func
from sqlalchemy import Table, text
# Put the parent of the current working directory on the import path;
# presumably this is what lets the `internal_displacement` imports resolve
# when the notebook lives one level below the package.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    # Guard against adding the same entry again when the cell is re-run.
    sys.path.append(module_path)
from internal_displacement.model.model import Status, Session, Category, Article, Content, Country, CountryTerm, \
Location, Report, ReportDateSpan, ArticleCategory, Base
# Connection URL for the PostgreSQL database named 'id'.
# NOTE: `db_host` is not defined anywhere in this snippet; it has to exist
# in the notebook namespace before this cell is run.
db_url = 'postgresql://{user}:{password}@{db_host}/{db}'.format(
    db='id',
    db_host=db_host,
    password='jupyter',
    user='jupyter',
)

# Create the engine, bind the project's Session factory to it, and open the
# session object that the rest of the walkthrough uses.
engine = create_engine(db_url)
Session.configure(bind=engine)
session = Session()
import spacy
from internal_displacement.scraper import Scraper
from internal_displacement.interpreter import Interpreter
from internal_displacement.pipeline import Pipeline
from internal_displacement.add_countries import load_countries, delete_countries
import pandas as pd
# Project helper called with the open session; presumably it fills the
# country tables (its implementation is not visible here).
load_countries(session)
# Scraper instance that is handed to the Pipeline constructor further down.
scraper = Scraper()
# spaCy language object loaded under the name 'en'; used below to lemmatise
# the relevance keywords and passed to the Interpreter.
nlp = spacy.load('en')
# Vocabulary lists handed to the Interpreter constructor further down.
# The role of each list is taken from its name; how the Interpreter uses
# them is not visible in this snippet.

# Terms for reports about people ('stranded' was listed twice; the
# duplicate entry has been removed).
person_reporting_terms = [
    'displaced', 'evacuated', 'forced', 'flee', 'homeless', 'relief camp',
    'sheltered', 'relocated', 'stranded', 'stuck', 'killed', 'dead', 'died',
    'drown',
]

# Terms for reports about structures.
structure_reporting_terms = [
    'destroyed', 'damaged', 'swept', 'collapsed',
    'flooded', 'washed', 'inundated', 'evacuate',
]

# Units in which people are counted.
person_reporting_units = [
    'families', 'person', 'people', 'individuals', 'locals', 'villagers',
    'residents', 'occupants', 'citizens', 'households', 'life',
]

# Units in which structures are counted.
structure_reporting_units = [
    'home', 'house', 'hut', 'dwelling', 'building', 'shop', 'business',
    'apartment', 'flat', 'residence',
]

# Keywords whose lemmas are computed below and passed to the Interpreter.
relevant_article_terms = [
    'Rainstorm', 'hurricane', 'tornado', 'rain', 'storm', 'earthquake',
]
# Lemmas of the relevance keywords: the terms are joined with single spaces,
# parsed by `nlp` as one document, and each token's `lemma_` is collected.
relevant_article_lemmas = [
    token.lemma_
    for token in nlp(" ".join(relevant_article_terms))
]
# Data directory, given relative to the notebook's working directory.
data_path = '../data'

# Build the Interpreter from the spaCy object, the vocabulary lists and the
# pickled default classifier/encoder. Arguments are kept positional and in
# their original order.
interpreter = Interpreter(
    nlp,
    person_reporting_terms,
    structure_reporting_terms,
    person_reporting_units,
    structure_reporting_units,
    relevant_article_lemmas,
    data_path,
    model_path='../internal_displacement/classifiers/default_model.pkl',
    encoder_path='../internal_displacement/classifiers/default_encoder.pkl',
)

# The pipeline ties together the database session, scraper and interpreter.
pipeline = Pipeline(session, scraper, interpreter)
# Run every URL through the pipeline. `url_list` is not defined in this
# snippet and must already exist in the notebook namespace.
for url in url_list:
    try:
        pipeline.process_url(url)
    except exc.IntegrityError:
        # A constraint violation (presumably a URL that is already stored --
        # confirm against the model) leaves the session in a failed
        # transaction; roll back so the next URL can be processed.
        session.rollback()
# Report the total number of Article rows.
print("{} articles in database".format(session.query(Article.id).count()))

# One (status, count) row per distinct Article.status value.
article_stats = (
    session.query(Article.status, func.count(Article.status))
    .group_by(Article.status)
    .all()
)

print("Article statuses:")
for status, ct in article_stats:
    print("{}: {}".format(status, ct))