# -*- coding: utf-8 -*-
import cv2
import numpy as np
import json
import collections

# set here the number of clusters you want to get
CLUSTERS_COUNT = 2
# the length of the feature vector; less representative features will be omitted
FEATURES_NUM = 1000
# for debugging or to save memory, the number of loaded tweets can be limited with this parameter
TOTAL_TWEETS = 50000
FILE_IN = 'data/in/tweets.json'
FILE_OUT = 'data/out/clustered.html'
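# The input file is expected to hold one tweet JSON object per line (Twitter API format),
# with the hashtags available under entities.hashtags and the tweet body under text.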
# build character n-grams of length 4 from a word
def getCharNGramm(word, num=4):
    # if the word has 4 characters or fewer, return the whole word
    if len(word) <= num:
        return [word]
    iterations = len(word) - num + 1
    res = []
    for i in range(iterations):
        ng = word[i:(num + i)]
        res.append(ng)
    # all character 4-grams are extracted; e.g. for the hashtag #abcde this returns ['abcd', 'bcde']
    return res
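# For example:
#   getCharNGramm('holiday') -> ['holi', 'olid', 'lida', 'iday']
#   getCharNGramm('cat')     -> ['cat']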
# extract the hashtags and the tweets themselves
def getHashTweets():
    tweets = []
    tweets_hash = []
    tweets_hash_N = []
    # start reading the JSON tweets, one object per line
    file_tweets = open(FILE_IN, 'r', encoding='utf-8')
    for line in file_tweets:
        tweet = json.loads(line)
        # tweets with more than 3 hashtags are treated as spam
        if len(tweet[u'entities'][u'hashtags']) > 3:
            continue
        # tweets with no hashtags at all are skipped
        if len(tweet[u'entities'][u'hashtags']) > 0:
            hashes = []
            hashesN = []
            for el in tweet[u'entities'][u'hashtags']:
                hashes.append(el[u'text'])
                # turn the full hashtag into "n-gram hashes"
                hashesTmp = getCharNGramm(el[u'text'])
                for h in hashesTmp:
                    hashesN.append(h)
            tweets_hash.append(hashes)
            tweets_hash_N.append(hashesN)
            tweets.append(tweet)
        if len(tweets) >= TOTAL_TWEETS:
            break
    file_tweets.close()
    return tweets, tweets_hash, tweets_hash_N
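# tweets_hash keeps the full hashtag strings per tweet, tweets_hash_N the 4-gram features;
# for example, hashtags ['python', 'opencv'] give ['pyth', 'ytho', 'thon', 'open', 'penc', 'encv']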
# build the dictionary of features from the most frequent n-grams
def get_dict(vectors):
    words_all = []
    for hashes in vectors:
        for h in hashes:
            words_all.append(h)
    x = collections.Counter(words_all)
    words = [elt for elt, count in x.most_common(FEATURES_NUM)]
    return words
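# The dictionary holds at most FEATURES_NUM n-grams, most frequent first; it defines
# the axes of the feature space used below.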
# build the feature vector for a single tweet: one count per dictionary entry
def get_vector(hashes, feature_dict):
    res_vector = []
    for k, d in enumerate(feature_dict):
        res_vector.append(0)
        for h in hashes:
            if h == d:
                res_vector[k] += 1
    return res_vector
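# For example, with feature_dict = ['pyth', 'ytho', 'open'] and hashes = ['pyth', 'open', 'pyth'],
# the resulting vector is [2, 0, 1].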
# 1: load the tweets
tweets, tweets_hash, tweets_hash_N = getHashTweets()
print('total tweets for analysis = ' + str(len(tweets)))
# 2: extract the dictionary of features
feature_dict = get_dict(tweets_hash_N)
print('dictionary length ' + str(len(feature_dict)))
# 3: build the feature vector for every tweet
tweets_vectors = []
for hashes in tweets_hash_N:
    tweets_vectors.append(get_vector(hashes, feature_dict))
print('vectors created, clustering is starting now!')
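# tweets_vectors is now a list of len(tweets) rows, each of length len(feature_dict),
# holding the 4-gram counts for one tweet.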
# 4: cluster with K-means (the None argument is the optional bestLabels buffer in OpenCV 3+)
tweets_vectors = np.float32(tweets_vectors)
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
ret, label, center = cv2.kmeans(tweets_vectors, CLUSTERS_COUNT, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
res_bin = label.flatten()
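# label has shape (n_tweets, 1); after flattening, res_bin[i] is the cluster index
# (0 .. CLUSTERS_COUNT-1) assigned to tweet i, and center holds the cluster centroids.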
# 5: write the result as a simple HTML file
result = [[] for _ in range(CLUSTERS_COUNT)]
for k, el in enumerate(res_bin):
    result[el].append(tweets[k][u'text'])
with open(FILE_OUT, 'w', encoding='utf-8') as outfile:
    for cluster in result:
        outfile.write("<h3>New Cluster (" + str(len(cluster)) + ")</h3>" + "\n")
        for tt in cluster:
            outfile.write("<p>" + tt + "</p>\n")
print('All done. Result in file ' + FILE_OUT)