jcseg-server.properties

# jcseg server configuration file with standard json syntax
# question or report to chenxin<chenxin619315@gmail.com>
{
    # jcseg server configuration
    "server_config": {
        # server port
        "port": 1990,
        
        # default conmunication charset
        "charset": "utf-8",
        
        # http idle timeout in ms
        "http_connection_idle_timeout": 60000,
        
        # jetty maximum thread pool size
        "max_thread_pool_size": 200,
        
        # thread idle timeout in ms
        "thread_idle_timeout": 30000,
        
        # http output buffer size
        "http_output_buffer_size": 32768,
        
        # request header size
        "http_request_header_size": 8192,
        
        # response header size
        "http_response_header_size": 8192
    },
    
    
    # global setting for jcseg, yet another copy of the old 
    # configuration file jcseg.properties
    "jcseg_global_config": {
        # maximum match length. (5-7)
        "jcseg_maxlen": 7,
        
        # recognized the chinese name.
        # (true to open and false to close it)
        "jcseg_icnname": true,
        
        # maximum length for pair punctuation text.
        # set it to 0 to close this function
        "jcseg_pptmaxlen": 7,
        
        # maximum length for chinese last name andron.
        "jcseg_cnmaxlnadron": 1,
        
        # whether to clear the stopwords.
        # (set true to clear stopwords and false to close it)
        "jcseg_clearstopword": false,
        
        # whether to convert the chinese numeric to arabic number. 
        # (set to true open it and false to close it) like '\u4E09\u4E07' to 30000.
        "jcseg_cnnumtoarabic": true,
        
        # whether to convert the chinese fraction to arabic fraction.
        # @Note: for lucene,solr,elasticsearch eg.. close it.
        "jcseg_cnfratoarabic": false,
        
        # whether to keep the unrecognized word. 
        # (set true to keep unrecognized word and false to clear it)
        "jcseg_keepunregword": true,
        
        # whether to start the secondary segmentation for the complex english words.
        "jcseg_ensencondseg": true,
        
        # min length of the secondary simple token. 
        # (better larger than 1)
        "jcseg_stokenminlen": 2,
        
        #thrshold for chinese name recognize.
        # better not change it before you know what you are doing.
        "jcseg_nsthreshold": 1000000,
        
        #The punctuations that will be keep in an token.
        # (Not the end of the token).
        "jcseg_keeppunctuations": "@#%.&+"
    },
    
    # dictionary instance setting.
    # add yours here with standard json syntax
    "jcseg_dict": {
        "master": {
            # set to null to load the lexicons from the classpath
            path: null,
            #"path": [
            #    "{jar.dir}/lexicon"
            #    # absolute path here
            #    # "/java/JavaSE/jcseg/lexicon"
            #],
            
            # wether to load the part of speech of the words
            "loadpos": true,

			# whether to load the pinyin of the words.
			"loadpinyin": true,

			# whether to load the synoyms words of the words.
			"loadsyn": true,

			# whether to load the entity of the words.
            "loadentity": true,
                    
            # whether to load the modified lexicon file auto.
            "autoload": true,
        
            # Poll time for auto load. (in seconds)
            "polltime": 300
        }
        
        # add more of yours here
        # ,"name" : {
        #   "path": [
        #       "absolute jcseg standard lexicon path 1",
        #       "absolute jcseg standard lexicon path 2"
        #       ...
        #   ],
        #   "autoload": 0,
        #   "polltime": 300
        # }
    },
    
    # JcsegTaskConfig instance setting.
    # @Note: 
    # All the config instance here is extends from the global_setting above.
    # do nothing will extends all the setting from global_setting
    "jcseg_config": {
        "master": {
            # extends and Override the global setting
            "jcseg_pptmaxlen": 0,
            "jcseg_cnfratoarabic": true,
            "jcseg_keepunregword": false
        }
        
        # this one is for keywords,keyphrase,sentence,summary extract
        # @Note: do not delete this instance if u want jcseg to
        # offset u extractor service
        ,"extractor": {
            "jcseg_pptmaxlen": 0,
            "jcseg_clearstopword": true,
            "jcseg_cnnumtoarabic": false,
            "jcseg_cnfratoarabic": false,
            "jcseg_keepunregword": false,
            "jcseg_ensencondseg": false
        }

        # well, this one is for NLP only
        ,"nlp" : {
            "jcseg_ensencondseg": false,
            "jcseg_cnfratoarabic": true,
            "jcseg_cnnumtoarabic": true
        }
        
        # add more of yours here
        # ,"name": {
        #   ...
        # }
    },
    
    # jcseg tokenizer instance setting.
    # Your could let the instance service for you by access:
    # http://jcseg_server_host:port/tokenizer/instance_name
    # instance_name is the name of instance you define here.
    "jcseg_tokenizer": {
        "master": {
            # jcseg tokenizer algorithm, could be:
            # 1: SIMPLE_MODE
            # 2: COMPLEX_MODE
            # 3: DETECT_MODE
            # 4: SEARCH_MODE
            # 5: DELIMITER_MODE
            # 6: NLP_MODE
            # see org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig for more info
            "algorithm": 2,
            
            # dictionary instance name
            # choose one of your defines above in the dict scope
            "dict": "master",
            
            # JcsegTaskConfig instance name
            # choose one of your defines above in the config scope
            "config": "master"
        }
        
        # this tokenizer instance is for extractor service
        # do not delete it if you want jcseg to offset you extractor service
        ,"extractor": {
            "algorithm": 2,
            "dict": "master",
            "config": "extractor"
        }

        # this tokenizer instance of for NLP analysis
        # keep it for you NLP project
        ,"nlp" : {
            "algorithm": 6,
            "dict": "master",
            "config": "nlp"
        }
        
        # add more of your here
        # ,"name": {
        #   ...
        # }
    }
}