forked from LennyLeng/SOC_Sankey_Generator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path etl.py
190 lines (156 loc) · 6.4 KB
/
etl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#coding=utf8
import pandas as pd
import json
import re
def get_filter(csv_file):
    """Read filter rules from a GBK-encoded CSV file.

    Each data row has four columns: filter type ('ex' = exclude rows whose
    cell matches the regex, 'in' = keep only rows whose cell matches), the
    0-based column index the rule applies to, the regex pattern, and a
    human-readable tip used in log output.  Rows whose first cell starts
    with '过滤模式' (the header) or '#' (a commented-out rule) are skipped.

    Args:
        csv_file: path to the rules CSV (encoding 'gbk', no pandas header).

    Returns:
        A list of dicts with keys 'filter_type', 'col_index' (int),
        'pattern_val' and 'tip'.
    """
    filter_list = []
    data = pd.read_csv(csv_file, encoding='gbk', header=None)
    for data_row in data.values:
        # Skip the header row and commented-out rules.
        if data_row[0].startswith(('过滤模式', '#')):
            continue
        # FIX: build a dict literal instead of shadowing the builtin `dict`.
        filter_list.append({
            'filter_type': data_row[0],
            'col_index': int(data_row[1]),
            'pattern_val': data_row[2],
            'tip': data_row[3],
        })
    return filter_list
def _log_row(data_row, note):
    """Print one per-row decision in the tab-separated log format."""
    print(data_row[0], '\t=>\t', data_row[2][:10] + '…', '\t=>\t',
          data_row[1], '\t|\t', str(data_row[3]), '\t' + note)


def _keep_source(addr, filter_list):
    """Return True if a source address survives every column-0 filter rule.

    'ex' rules drop the address when the regex matches; 'in' rules drop it
    when the regex does not match.  Rules that cannot be applied (missing
    key, invalid regex, non-string cell) are skipped, best-effort, as in
    the original code.
    """
    for rule in filter_list:
        try:
            if rule['col_index'] != 0:
                continue
            if rule['filter_type'] == 'ex' and re.search(rule['pattern_val'], addr):
                return False
            if rule['filter_type'] == 'in' and not re.search(rule['pattern_val'], addr):
                return False
        except Exception:
            pass  # best-effort: a broken rule never drops data
    return True


def _filter_note(data_row, filter_list):
    """Return the log note of the first rule that excludes this row, else None.

    'ex' rules exclude on a regex match of the indexed cell; 'in' rules
    exclude on a non-match (the note is prefixed with '未').  Unusable
    rules are skipped, best-effort.
    """
    for rule in filter_list:
        try:
            cell = data_row[rule['col_index']]
            if rule['filter_type'] == 'ex' and re.search(rule['pattern_val'], cell):
                return rule['tip']
            if rule['filter_type'] == 'in' and not re.search(rule['pattern_val'], cell):
                return '未' + rule['tip']
        except Exception:
            pass  # best-effort: a broken rule never drops data
    return None


def generate_json(csv_file, limit, filter_list):
    """Extract Sankey-diagram JSON from a GBK-encoded SOC event CSV.

    CSV columns (no pandas header; a literal header row is detected and
    skipped wherever it appears): 0 = source address, 1 = destination
    address, 2 = event name, 3 = event count (may contain ',' thousands
    separators).  Each processed row contributes two links,
    source -> event and event -> destination, with counts summed over
    duplicate pairs.  '空值' placeholder cells are normalized, filter
    rules are applied, and rows whose destination is also a kept source
    address are dropped to avoid cycles in the diagram.

    Args:
        csv_file: path to the event CSV.
        limit: '' for unlimited, otherwise the maximum number of processed
            rows as an int-convertible string.
        filter_list: rules as produced by get_filter().

    Returns:
        A JSON string {'nodes': [...], 'links': [...]} for an ECharts
        Sankey chart.
    """
    print('开始抽取整理数据...')
    data = pd.read_csv(csv_file, encoding='gbk', header=None)
    # Unique source addresses, minus the header cell, with the column-0
    # rules applied.  FIX: the original removed elements from the list
    # while iterating it, which silently skips the element following each
    # removal; filtering into a new list applies every rule to every
    # address.  (The original's later header-removal on this list was a
    # no-op — the list is not used after the row loop — and is dropped.)
    source_addrs = [addr for addr in data[0].unique()
                    if addr != '源地址' and _keep_source(addr, filter_list)]
    links_tmp = {}   # json-encoded [from, to] pair -> summed event count
    nodes_tmp = []   # every node name seen; de-duplicated after the loop
    # Hoist the limit conversion out of the loop (original re-parsed it
    # every iteration).  Only fully processed rows count toward the limit,
    # matching the original placement of `i += 1` after the `continue`s.
    max_rows = int(limit) if limit != '' else None
    processed = 0
    for data_row in data.values:
        if max_rows is not None and processed >= max_rows:
            break
        # Skip the header row.
        if (data_row[0] == '源地址' or data_row[1] == '目的地址'
                or data_row[2].replace(" ", "") == '事件名称'):
            continue
        # Strip thousands separators from the count; cells pandas already
        # parsed as numbers (no .replace) are left as-is.
        try:
            data_row[3] = int(data_row[3].replace(',', ''))
        except (AttributeError, ValueError):
            pass
        data_row[2] = data_row[2].replace(" ", "")
        # Apply the filter rules to the whole row.
        note = _filter_note(data_row, filter_list)
        if note is not None:
            _log_row(data_row, note + '\t忽略')
            continue
        # Normalize placeholder / degenerate cells.
        if data_row[2] == '' or data_row[2] == '空值':
            _log_row(data_row, '事件名称为空\t修正')
            data_row[2] = '空名称'
        if data_row[0] == data_row[1] or data_row[1] == '空值':
            _log_row(data_row, '源目相同或者目的为空\t修正')
            data_row[1] = '0.0.0.0'
        if data_row[0] == '空值':
            _log_row(data_row, '源目为空\t修正')
            data_row[0] = '0.0.0.0'
        # A destination that is also a kept source would loop the Sankey.
        if data_row[1] in source_addrs:
            _log_row(data_row, '出现打环情况\t忽略')
            continue
        nodes_tmp.extend((data_row[0], data_row[1], data_row[2]))
        # Two hops per row: source -> event, event -> destination.
        for pair in ([data_row[0], data_row[2]], [data_row[2], data_row[1]]):
            key = json.dumps(pair)
            if key in links_tmp:
                links_tmp[key] += data_row[3]
            else:
                links_tmp[key] = data_row[3]
        processed += 1
    # De-duplicate node names.
    nodes_tmp = list(set(nodes_tmp))
    # Defensive: replace any surviving '空值' placeholder with the sentinel,
    # matching the per-row normalization above.
    if '空值' in nodes_tmp:
        nodes_tmp.remove('空值')
        nodes_tmp.append('0.0.0.0')
    nodes = [{'name': name} for name in nodes_tmp]
    links = []
    for key, value in links_tmp.items():
        source, target = json.loads(key)
        links.append({'source': source, 'target': target, 'value': value})
    print('抽取整理数据完成!')
    return json.dumps({'nodes': nodes, 'links': links})
if __name__ == '__main__':
    # Load the filter rules, build the Sankey JSON from the event CSV,
    # and write it where the web page expects to find it.
    rules = get_filter('conf/filter.csv')
    sankey_json = generate_json('csv/test.csv', '', rules)
    with open('web/tmp.json', 'w') as out_file:
        out_file.write(sankey_json)