Working with Excerpts

This guide assumes you are working in a Jupyter notebook.

1. Load required modules:

%load_ext autoreload
%autoreload 2
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import spacy
from sklearn.metrics import classification_report
from internal_displacement.interpreter import Interpreter
from internal_displacement.excerpt_helper import Helper
from internal_displacement.excerpt_helper import MeanEmbeddingVectorizer
import gensim
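
Note: the %autoreload magics pick up edits to the project's source files without restarting the kernel, and the sys.path addition makes the internal_displacement package importable, assuming the notebook lives one directory below the repository root.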

2. Set up arguments for the Interpreter:

nlp = spacy.load('en')
person_reporting_terms = [
    'displaced', 'evacuated', 'forced', 'flee', 'homeless', 'relief camp',
    'sheltered', 'relocated', 'stranded', 'stuck', 'accommodated']

structure_reporting_terms = [
    'destroyed', 'damaged', 'swept', 'collapsed',
    'flooded', 'washed', 'inundated', 'evacuate'
]

person_reporting_units = ["families", "person", "people", "individuals", "locals", "villagers", "residents",
                            "occupants", "citizens", "households"]

structure_reporting_units = ["home", "house", "hut", "dwelling", "building"]

relevant_article_terms = ['Rainstorm', 'hurricane',
                          'tornado', 'rain', 'storm', 'earthquake']
relevant_article_lemmas = [t.lemma_ for t in nlp(
    " ".join(relevant_article_terms))]

data_path = '../data'

3. Initialize the interpreter and helper:

interpreter = Interpreter(nlp, person_reporting_terms, structure_reporting_terms, person_reporting_units,
                          structure_reporting_units, relevant_article_lemmas, data_path,
                          model_path='../internal_displacement/classifiers/default_model.pkl',
                          encoder_path='../internal_displacement/classifiers/default_encoder.pkl')

helper = Helper(nlp, '../internal_displacement/classifiers/unit_vectorizer.pkl',
                '../internal_displacement/classifiers/unit_model.pkl',
                '../internal_displacement/classifiers/term_vectorizer.pkl',
                '../internal_displacement/classifiers/term_model.pkl',
                '../internal_displacement/classifiers/term_svc.pkl')

4. Load the pre-trained Word2Vec model:

w2v = gensim.models.KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)
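
The GoogleNews vectors are not included in the repository and must be downloaded separately (the binary file is several gigabytes). If memory is a concern, gensim's limit argument loads only the first N vectors, at some cost to vocabulary coverage:

w2v = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin', binary=True, limit=500000)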

5. Get reports and choose the most likely one:

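Note that excerpt is the raw excerpt text and is not defined earlier in this guide; any string will do. A hypothetical example:

excerpt = ("Heavy rains flooded the region on Monday and "
           "more than 200 families were evacuated from their homes.")
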
cleaned_excerpt = helper.cleanup(excerpt)
reports = interpreter.process_article_new(cleaned_excerpt)
top_report = helper.get_report(reports)
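
Judging by how top_report is indexed in the steps below, it is ordered as (quantity, reporting term, reporting unit).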

6. Extract the reporting unit:

unit_rules = top_report[2]
excerpt_features = helper.reporting_unit_vectorizer.transform(cleaned_excerpt)
unit_ml = helper.reporting_unit_classifier.predict(excerpt_features)
unit_prediction = helper.combine_predictions(unit_ml, unit_rules)
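
The combination logic lives in Helper.combine_predictions; as a rough sketch of the idea (an illustration, not the project's actual implementation), the rule-based value could take precedence whenever the rules found something:

def combine_predictions_sketch(ml_prediction, rule_prediction):
    # Illustrative only: prefer the rule-based extraction when present,
    # otherwise fall back to the classifier's output.
    return rule_prediction if rule_prediction else ml_prediction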

7. Extract the reporting term:

w2vVectorizer = MeanEmbeddingVectorizer(w2v)
term_rules = top_report[1]

excerpt_features_1 = helper.reporting_term_vectorizer.transform(cleaned_excerpt)
p1 = helper.reporting_term_classifier.predict_proba(excerpt_features_1)

excerpt_features_2 = w2vVectorizer.transform(cleaned_excerpt)
p2 = helper.reporting_term_svc.predict_proba(excerpt_features_2)

term_ml = helper.combine_probabilities(p1, p2, helper.reporting_term_classifier.classes_)

term_prediction = helper.combine_predictions(term_ml, term_rules)
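
As a rough illustration of what combining two classifiers' probabilities involves (not necessarily the exact logic of Helper.combine_probabilities), one common approach averages the two probability matrices and picks the most likely class:

import numpy as np

def combine_probabilities_sketch(p1, p2, classes):
    # Assumes both models output probabilities over the same class ordering.
    mean_proba = (np.asarray(p1) + np.asarray(p2)) / 2.0  # element-wise average
    return classes[np.argmax(mean_proba, axis=1)]         # most probable class per sample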

8. Set up lemmas for extracting quantity:

person_units = ["person", "people", "individuals", "locals", "villagers", "residents",
                "occupants", "citizens", "IDP"]

household_units = ["home", "house", "hut", "dwelling", "building", "families", "households"]

person_lemmas = [t.lemma_ for t in nlp(" ".join(person_units))]
household_lemmas = [t.lemma_ for t in nlp(" ".join(household_units))]
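
Lemmatizing the unit words lets downstream matching treat inflected forms alike, e.g. "families" reduces to "family" (exact output depends on the spaCy model version):

print(nlp("families")[0].lemma_)  # e.g. 'family'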

9. Extract quantity:

quantity_1 = top_report[0]
quantity_2 = helper.get_number(excerpt, unit_prediction, person_lemmas, household_lemmas)
quantity_prediction = helper.combine_quantities(quantity_1, quantity_2)

10. Extract location & country:

all_locations = interpreter.extract_countries(interpreter.cleanup(excerpt))
top_location = helper.choose_country(all_locations)

predicted_location = top_location[0]
predicted_country = top_location[1]
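
Finally, the individual predictions can be collected into a single record, for example:

result = {
    'reporting_unit': unit_prediction,
    'reporting_term': term_prediction,
    'quantity': quantity_prediction,
    'location': predicted_location,
    'country': predicted_country,
}
print(result)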