forked from lionsoul2014/jcseg
-
Notifications
You must be signed in to change notification settings - Fork 1
/
jcseg-server.properties
207 lines (169 loc) · 6.38 KB
/
jcseg-server.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# jcseg server configuration file with standard json syntax
# question or report to chenxin<[email protected]>
{
# jcseg server configuration
"server_config": {
# server port
"port": 1990,
# default conmunication charset
"charset": "utf-8",
# http idle timeout in ms
"http_connection_idle_timeout": 60000,
# jetty maximum thread pool size
"max_thread_pool_size": 200,
# thread idle timeout in ms
"thread_idle_timeout": 30000,
# http output buffer size
"http_output_buffer_size": 32768,
# request header size
"http_request_header_size": 8192,
# response header size
"http_response_header_size": 8192
},
# global setting for jcseg, yet another copy of the old
# configuration file jcseg.properties
"jcseg_global_config": {
# maximum match length. (5-7)
"jcseg_maxlen": 7,
# recognized the chinese name.
# (true to open and false to close it)
"jcseg_icnname": true,
# maximum length for pair punctuation text.
# set it to 0 to close this function
"jcseg_pptmaxlen": 7,
# maximum length for chinese last name andron.
"jcseg_cnmaxlnadron": 1,
# whether to clear the stopwords.
# (set true to clear stopwords and false to close it)
"jcseg_clearstopword": false,
# whether to convert the chinese numeric to arabic number.
# (set to true open it and false to close it) like '\u4E09\u4E07' to 30000.
"jcseg_cnnumtoarabic": true,
# whether to convert the chinese fraction to arabic fraction.
# @Note: for lucene,solr,elasticsearch eg.. close it.
"jcseg_cnfratoarabic": false,
# whether to keep the unrecognized word.
# (set true to keep unrecognized word and false to clear it)
"jcseg_keepunregword": true,
# whether to start the secondary segmentation for the complex english words.
"jcseg_ensencondseg": true,
# min length of the secondary simple token.
# (better larger than 1)
"jcseg_stokenminlen": 2,
#thrshold for chinese name recognize.
# better not change it before you know what you are doing.
"jcseg_nsthreshold": 1000000,
#The punctuations that will be keep in an token.
# (Not the end of the token).
"jcseg_keeppunctuations": "@#%.&+"
},
# dictionary instance setting.
# add yours here with standard json syntax
"jcseg_dict": {
"master": {
# set to null to load the lexicons from the classpath
path: null,
#"path": [
# "{jar.dir}/lexicon"
# # absolute path here
# # "/java/JavaSE/jcseg/lexicon"
#],
# wether to load the part of speech of the words
"loadpos": true,
# whether to load the pinyin of the words.
"loadpinyin": true,
# whether to load the synoyms words of the words.
"loadsyn": true,
# whether to load the entity of the words.
"loadentity": true,
# whether to load the modified lexicon file auto.
"autoload": true,
# Poll time for auto load. (in seconds)
"polltime": 300
}
# add more of yours here
# ,"name" : {
# "path": [
# "absolute jcseg standard lexicon path 1",
# "absolute jcseg standard lexicon path 2"
# ...
# ],
# "autoload": 0,
# "polltime": 300
# }
},
# JcsegTaskConfig instance setting.
# @Note:
# All the config instance here is extends from the global_setting above.
# do nothing will extends all the setting from global_setting
"jcseg_config": {
"master": {
# extends and Override the global setting
"jcseg_pptmaxlen": 0,
"jcseg_cnfratoarabic": true,
"jcseg_keepunregword": false
}
# this one is for keywords,keyphrase,sentence,summary extract
# @Note: do not delete this instance if u want jcseg to
# offset u extractor service
,"extractor": {
"jcseg_pptmaxlen": 0,
"jcseg_clearstopword": true,
"jcseg_cnnumtoarabic": false,
"jcseg_cnfratoarabic": false,
"jcseg_keepunregword": false,
"jcseg_ensencondseg": false
}
# well, this one is for NLP only
,"nlp" : {
"jcseg_ensencondseg": false,
"jcseg_cnfratoarabic": true,
"jcseg_cnnumtoarabic": true
}
# add more of yours here
# ,"name": {
# ...
# }
},
# jcseg tokenizer instance setting.
# Your could let the instance service for you by access:
# http://jcseg_server_host:port/tokenizer/instance_name
# instance_name is the name of instance you define here.
"jcseg_tokenizer": {
"master": {
# jcseg tokenizer algorithm, could be:
# 1: SIMPLE_MODE
# 2: COMPLEX_MODE
# 3: DETECT_MODE
# 4: SEARCH_MODE
# 5: DELIMITER_MODE
# 6: NLP_MODE
# see org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig for more info
"algorithm": 2,
# dictionary instance name
# choose one of your defines above in the dict scope
"dict": "master",
# JcsegTaskConfig instance name
# choose one of your defines above in the config scope
"config": "master"
}
# this tokenizer instance is for extractor service
# do not delete it if you want jcseg to offset you extractor service
,"extractor": {
"algorithm": 2,
"dict": "master",
"config": "extractor"
}
# this tokenizer instance of for NLP analysis
# keep it for you NLP project
,"nlp" : {
"algorithm": 6,
"dict": "master",
"config": "nlp"
}
# add more of your here
# ,"name": {
# ...
# }
}
}