-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathclassifier_twitter.py
90 lines (70 loc) · 2.81 KB
/
classifier_twitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
"""
Using Classifier and Twitter Streaming Loader
=============================================
This example illustrates how to train/classify tweets from Twitter streams.
To run this example, the ``tweepy`` and ``jq`` packages are required.
You can install them by ``pip install tweepy jq``.
"""
from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.twitter import TwitterStreamLoader, TwitterOAuthHandler
def get_loader():
    """Build a fresh ``TwitterStreamLoader`` backed by OAuth credentials.

    Replace the placeholder keys below with your own; they can be
    obtained at: https://apps.twitter.com/
    """
    credentials = {
        'consumer_key': 'XXXXXXXXXXXXXXXXXXXX',
        'consumer_secret': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
        'access_token': 'XXXXXXXX-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
        'access_secret': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    }
    return TwitterStreamLoader(TwitterOAuthHandler(**credentials))
# Define a Schema mapping jq-style JSON paths in each tweet to feature
# roles: the tweet language is the label to predict, the tweet text and
# user-profile fields are string features, and all other keys are ignored.
schema = Schema({
    '.lang': Schema.LABEL,
    '.text': Schema.STRING,
    '.user.lang': Schema.STRING,
    '.user.description': Schema.STRING,
}, Schema.IGNORE)

# Create a Classifier Service (this starts a local Jubatus process).
classifier = Classifier.run(Config())

try:
    # Number of tweets used for training.
    n_train = 1000

    print('---- Train: {0} tweets -------------------------------------'.format(n_train))

    # Train the classifier using tweets from the Twitter stream, keeping
    # track of which languages (labels) were actually seen during training.
    trained_labels = set()
    dataset = Dataset(get_loader(), schema)
    for (idx, label) in classifier.train(dataset):
        if idx == n_train: break
        trained_labels.add(label)
        text_summary = dataset.get(idx)['.text'].replace('\n', '')
        print('Train[{0}]: language {1} >> {2}'.format(idx, label, text_summary))

    print('Languages Trained: {0}'.format(str(trained_labels)))
    print('---- Prediction (Ctrl-C to stop) -------------------------------------')

    try:
        # Classify tweets using the classifier; run until interrupted.
        (y_true, y_pred) = ([], [])
        dataset = Dataset(get_loader(), schema)
        for (idx, label, result) in classifier.classify(dataset):
            # result is sorted by score, so result[0][0] is the top label.
            (true_lang, pred_lang) = (label, result[0][0])
            text_summary = dataset.get(idx)['.text'].replace('\n', '')

            message = None
            if pred_lang == true_lang:
                message = 'correct!'
            elif true_lang in trained_labels:
                message = 'incorrect'
            else:
                # The correct language is what we haven't trained.
                message = 'not-trained'
            print("Classify[{0}]: {1} (predicted = {2} | actual = {3}) >> {4}".format(idx, message, pred_lang, true_lang, text_summary))

            # Only score tweets whose true language was seen during
            # training; predictions for unseen languages can never be
            # correct by definition and would skew the metrics.
            if true_lang in trained_labels:
                y_true.append(true_lang)
                y_pred.append(pred_lang)
    except KeyboardInterrupt:
        pass  # Trap Ctrl-C

    try:
        # If scikit-learn is available, display metrics.
        import sklearn.metrics
        # Guard against an empty sample (e.g. Ctrl-C before any scored
        # prediction): classification_report raises on empty input.
        if y_true:
            print(sklearn.metrics.classification_report(y_true, y_pred))
    except ImportError:
        pass
finally:
    # Shut down the Jubatus service process started by Classifier.run()
    # above, so the example does not leak a child process on exit.
    classifier.stop()