forked from shirdrn/document-processor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNLPIR.h
More file actions
573 lines (541 loc) · 19.9 KB
/
NLPIR.h
File metadata and controls
573 lines (541 loc) · 19.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
/****************************************************************************
*
* NLPIR Chinese Lexical Analysis System Copyright (c) 2000-2013
* Dr. Kevin Zhang (Hua-Ping Zhang)
* All rights reserved.
*
* This file is the confidential and proprietary property of
* Kevin Zhang and the possession or use of this file requires
* a written license from the author.
* Filename:
* Abstract:
* NLPIR.h: definition of the NLPIR lexical analysis system API
* Author: Kevin Zhang
* Email: pipy_zhang@msn.com kevinzhang@bit.edu.cn
* Weibo: http://weibo.com/drkevinzhang
* Date: 2012-11-14
*
* Notes:
*
****************************************************************************/
#if !defined(__NLPIR_2013_LIB_INCLUDED__)
#define __NLPIR_2013_LIB_INCLUDED__
/*
 * NLPIR_API: decoration placed in front of every function declared in this header.
 *  - OS_LINUX defined: expands to nothing.
 *  - otherwise: extern "C" plus __declspec(dllexport) when NLPIR_EXPORTS is
 *    defined, or __declspec(dllimport) when it is not.
 *  - when any of the wrapper-project macros tested below is defined, the
 *    decoration is dropped again and NLPIR_API expands to nothing.
 */
#ifdef OS_LINUX
#define NLPIR_API
#else
#ifdef NLPIR_EXPORTS
#define NLPIR_API extern "C" __declspec(dllexport)
#else
#define NLPIR_API extern "C" __declspec(dllimport)
#endif
#endif
#if defined(ICTCLAS_JNI_EXPORTS)||defined(KEYEXTRACT_EXPORTS)||defined(NLPIR_JNI_EXPORTS)||defined(LJSUMMARY_EXPORTS)||defined(LJSUMMARY_JNI_EXPORTS)||defined(DOCEXTRACTOR_EXPORTS)
/* NLPIR_API is already defined at this point; redefining a macro with a
 * different replacement list without #undef is ill-formed and produces
 * "macro redefined" diagnostics, so remove the previous definition first. */
#undef NLPIR_API
#define NLPIR_API
#endif
//////////////////////////////////////////////////////////////////////////
//
// The following definitions provide compatibility with earlier ICTCLAS
// versions: each legacy ICTCLAS name is mapped onto its NLPIR replacement.
//
//////////////////////////////////////////////////////////////////////////
#define ICTCLAS_Init NLPIR_Init
#define ICTCLAS_Exit NLPIR_Exit
#define ICTCLAS_ImportUserDict NLPIR_ImportUserDict
#define ICTCLAS_FileProcess NLPIR_FileProcess
#define ICTCLAS_ParagraphProcess NLPIR_ParagraphProcess
#define ICTCLAS_ParagraphProcessA NLPIR_ParagraphProcessA
#define ICTCLAS_GetParagraphProcessAWordCount NLPIR_GetParagraphProcessAWordCount
#define ICTCLAS_ParagraphProcessAW NLPIR_ParagraphProcessAW
#define ICTCLAS_AddUserWord NLPIR_AddUserWord
#define ICTCLAS_SaveTheUsrDic NLPIR_SaveTheUsrDic
#define ICTCLAS_DelUsrWord NLPIR_DelUsrWord
#define ICTCLAS_GetUniProb NLPIR_GetUniProb
#define ICTCLAS_IsWord NLPIR_IsWord
#define ICTCLAS_SetPOSmap NLPIR_SetPOSmap
#define CICTCLAS CNLPIR
#define GetActiveICTCLAS GetActiveInstance
#define POS_MAP_NUMBER 4 //added by qp 2008.11.25; presumably the number of POS maps listed below - confirm
#define ICT_POS_MAP_FIRST 1 //ICT (Institute of Computing Technology) first-level POS tag set
#define ICT_POS_MAP_SECOND 0 //ICT (Institute of Computing Technology) second-level POS tag set
#define PKU_POS_MAP_SECOND 2 //PKU (Peking University) second-level POS tag set
#define PKU_POS_MAP_FIRST 3 //PKU (Peking University) first-level POS tag set
/* Capacity of the result_t::sPOS character buffer. */
#define POS_SIZE 40

/*
 * One entry of the result vector produced by the ParagraphProcessA family
 * of functions: describes a single word found in the input.
 */
struct result_t
{
    /* Start position of the word within the input sentence. */
    int start;

    /* Length of the word. */
    int length;

    /* Word type: part-of-speech ID value; allows quick access to the POS table. */
    char sPOS[POS_SIZE];

    /* Number of the part-of-speech tag. */
    int iPOS;

    /* Internal ID of the word; set to 0 or -1 for an out-of-vocabulary word. */
    int word_ID;

    /* User-dictionary marker: 1 = word comes from the user dictionary, 0 = it does not. */
    int word_type;

    /* Word weight (original note: "read weight"). */
    int weight;
};
/*
 * Character-encoding identifiers; GBK_CODE is the default value of the
 * `encode` argument of NLPIR_Init.
 * The derived values are parenthesized so that they stay intact inside larger
 * expressions (e.g. `2 * UTF8_CODE` must be 2, not `2 * GBK_CODE + 1`).
 */
#define GBK_CODE 0//GBK encoding, supported by default
#define UTF8_CODE (GBK_CODE+1)//UTF-8 encoding
#define BIG5_CODE (GBK_CODE+2)//BIG5 encoding
#define GBK_FANTI_CODE (GBK_CODE+3)//GBK encoding that also contains traditional Chinese characters
/*********************************************************************
*
* Func Name : NLPIR_Init
*
* Description: Initialize NLPIR.
* This function must be invoked before any of the other operations declared below.
*
* Parameters : sDataPath: path where the Data directory is stored.
* The default value is NULL (0); it indicates that the initial directory is the current working directory.
* encode: encoding code; defaults to GBK_CODE
* (presumably one of the *_CODE constants defined in this header - confirm)
* sLicenceCode: license code for unlimited usage; common users can ignore it (default 0)
* Returns : success or fail
* Author : Kevin Zhang
* History :
* 1.create 2013-6-8
*********************************************************************/
NLPIR_API bool NLPIR_Init(const char * sDataPath=0,int encode=GBK_CODE,const char*sLicenceCode=0);
/*********************************************************************
*
* Func Name : NLPIR_Exit
*
* Description: Exit NLPIR and free the related buffers.
* Exit the program and free memory.
* This function must be invoked once no further lexical analysis is needed.
*
* Parameters : None
*
* Returns : success or fail
* Author : Kevin Zhang
* History :
* 1.create 2002-8-6
*********************************************************************/
NLPIR_API bool NLPIR_Exit();
/*********************************************************************
*
* Func Name : NLPIR_ParagraphProcess
*
* Description: Process a paragraph
*
*
* Parameters : sParagraph: The source paragraph
*
* bPOStagged: whether POS tagging is needed, 0 for no tag; default: 1
* e.g. 张华平于1978年3月9日出生于江西省波阳县。
* Result: 张华平/nr 于/p 1978年/t 3月/t 9日/t 出生于/v 江西省/ns 波阳县/ns 。/w
* Returns : the result buffer pointer
* NOTE(review): ownership and lifetime of the returned buffer are not stated
* in this header - confirm before freeing or caching the pointer.
*
* Author : Kevin Zhang
* History :
* 1.create 2003-12-22
*********************************************************************/
NLPIR_API const char * NLPIR_ParagraphProcess(const char *sParagraph,int bPOStagged=1);
/*********************************************************************
*
* Func Name : NLPIR_ParagraphProcessA
*
* Description: Process a paragraph
*
*
* Parameters : sParagraph: The source paragraph
* pResultCount: pointer that receives the size of the result vector
* bUserDict: default true; presumably selects whether the user dictionary
* is applied - TODO confirm, it is not documented by the author
* Returns : the pointer to the result vector; it is managed by the system, the user must not allocate or free it
* Author : Kevin Zhang
* History :
* 1.create 2006-10-26
*********************************************************************/
NLPIR_API const result_t * NLPIR_ParagraphProcessA(const char *sParagraph,int *pResultCount,bool bUserDict=true);
/*********************************************************************
*
* Func Name : NLPIR_GetParagraphProcessAWordCount
*
* Description: Get the word count of a paragraph; API for C#.
* The word count helps the caller prepare a properly sized buffer for the result_t vector.
*
* Parameters : sParagraph: The source paragraph
*
* Returns : result vector size
* Author : Kevin Zhang
* History :
* 1.create 2007-3-15
*********************************************************************/
NLPIR_API int NLPIR_GetParagraphProcessAWordCount(const char *sParagraph);
/*********************************************************************
*
* Func Name : NLPIR_ParagraphProcessAW
*
* Description: Process a paragraph, API for C#
* NOTE(review): the function takes no paragraph argument; presumably it fills
* `result` for the paragraph last passed to NLPIR_GetParagraphProcessAWordCount - confirm.
*
* Parameters : nCount: presumably the number of elements available in `result` - confirm
* result: pointer to the result vector; it is allocated by the invoker
* Returns : None
* Author :
* History :
* 1.create 2007-3-15
*********************************************************************/
NLPIR_API void NLPIR_ParagraphProcessAW(int nCount,result_t * result);
/*********************************************************************
*
* Func Name : NLPIR_FileProcess
*
* Description: Process a text file
*
*
* Parameters : sSourceFilename: The source file name
* sResultFilename: The result file name
* bPOStagged: whether POS tagging is needed, 0 for no tag; default: 1
* e.g. NLPIR_FileProcess("E:\\Sample\\Corpus_NewPOS\\199802_Org.txt","E:\\Sample\\Corpus_NewPOS\\199802_Org_cla.txt");
* Returns : a double; the values returned on success and on failure are not
* documented by the author - TODO confirm
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
*********************************************************************/
NLPIR_API double NLPIR_FileProcess(const char *sSourceFilename,const char *sResultFilename,int bPOStagged=1);
/*********************************************************************
*
* Func Name : NLPIR_ImportUserDict
*
* Description: Import a user-defined dictionary
* Parameters : sFilename: text file name of the user dictionary
* Returns : The number of lexical entries imported successfully
* Author : Kevin Zhang
* History :
* 1.create 2003-11-28
*********************************************************************/
NLPIR_API unsigned int NLPIR_ImportUserDict(const char *sFilename);
/*********************************************************************
*
* Func Name : NLPIR_AddUserWord
*
* Description: add a word to the user dictionary; examples given by the author:
* 你好
* i3s n
* (the second form is presumably "word POS-tag" - confirm)
*
* Parameters : sWord: the word to add
*
* Returns : 1,true ; 0,false
*
* Author :
* History :
* 1.create 11:10:2008
*********************************************************************/
NLPIR_API int NLPIR_AddUserWord(const char *sWord);//added by qp 2008.11.10
/*********************************************************************
*
* Func Name : NLPIR_SaveTheUsrDic
*
* Description: Save the user dictionary to file
*
* Parameters : None
*
* Returns : 1,true; 2,false (as documented by the author)
* NOTE(review): NLPIR_AddUserWord documents 0 for false - verify whether "2" is a typo.
*
* Author :
* History :
* 1.create 11:10:2008
*********************************************************************/
NLPIR_API int NLPIR_SaveTheUsrDic();
/*********************************************************************
*
* Func Name : NLPIR_DelUsrWord
*
* Description: delete a word from the user dictionary
*
* Parameters : sWord: the word to delete
* Returns : -1, the word does not exist in the user dictionary; else, the handle of the deleted word
*
* Author :
* History :
* 1.create 11:10:2008
*********************************************************************/
NLPIR_API int NLPIR_DelUsrWord(const char *sWord);
/*********************************************************************
*
* Func Name : NLPIR_GetUniProb
*
* Description: Get Unigram Probability
*
*
* Parameters : sWord: the word to look up
* Returns : the unigram probability as a double; the value returned for a
* word that is not found is not documented by the author - TODO confirm
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
*********************************************************************/
NLPIR_API double NLPIR_GetUniProb(const char *sWord);
/*********************************************************************
*
* Func Name : NLPIR_IsWord
*
* Description: presumably tests whether sWord is a word known to the system.
* NOTE(review): the author's original text here was copied from
* NLPIR_GetUniProb ("Get Unigram Probability") - confirm the real contract.
*
* Parameters : sWord: the word to test
* Returns : bool; presumably true when sWord is a word, false otherwise - confirm
* Author : Kevin Zhang
* History :
* 1.create 2005-11-22
*********************************************************************/
NLPIR_API bool NLPIR_IsWord(const char *sWord);
/*********************************************************************
*
* Func Name : NLPIR_GetKeyWords
*
* Description: Extract keywords from sLine
*
* Parameters : sLine: the input paragraph
* nMaxKeyLimit: maximum number of keywords, up to 50 (default 50)
* bWeightOut: whether the keyword weights are output (default false)
* Returns : keywords list like:
* "科学发展观 宏观经济 " or
* "科学发展观 23.80 宏观经济 12.20" with weight
*
* Author :
* History :
* 1.create 2012/11/12
*********************************************************************/
NLPIR_API const char * NLPIR_GetKeyWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
/*********************************************************************
*
* Func Name : NLPIR_GetFileKeyWords
*
* Description: Extract keywords from a text file
*
* Parameters : sFilename: the input text file name
* nMaxKeyLimit: maximum number of keywords, up to 50 (default 50)
* bWeightOut: whether the keyword weights are output (default false)
* Returns : keywords list like:
* "科学发展观 宏观经济 " or
* "科学发展观 23.80 宏观经济 12.20" with weight
*
* Author :
* History :
* 1.create 2012/11/12
*********************************************************************/
NLPIR_API const char * NLPIR_GetFileKeyWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
/*********************************************************************
*
* Func Name : NLPIR_GetNewWords
*
* Description: Extract new words from sLine
*
* Parameters : sLine: the input paragraph
* nMaxKeyLimit: maximum number of new words, up to 50 (default 50)
* bWeightOut: whether the word weights are output (default false)
* Returns : new words list like:
* "科学发展观 屌丝 " or
* "科学发展观 23.80 屌丝 12.20" with weight
*
* Author :
* History :
* 1.create 2012/11/12
*********************************************************************/
NLPIR_API const char * NLPIR_GetNewWords(const char *sLine,int nMaxKeyLimit=50,bool bWeightOut=false);
/*********************************************************************
*
* Func Name : NLPIR_GetFileNewWords
*
* Description: Extract new words from a text file
*
* Parameters : sFilename: the input text file name
* nMaxKeyLimit: maximum number of new words, up to 50 (default 50)
* bWeightOut: whether the word weights are output (default false)
* Returns : new words list like:
* "科学发展观 宏观经济 " or
* "科学发展观 23.80 宏观经济 12.20" with weight
*
* Author :
* History :
* 1.create 2012/11/12
*********************************************************************/
NLPIR_API const char * NLPIR_GetFileNewWords(const char *sFilename,int nMaxKeyLimit=50,bool bWeightOut=false);
/*********************************************************************
*
* Func Name : NLPIR_FingerPrint
*
* Description: Extract a fingerprint from the paragraph
*
* Parameters : sLine: the input paragraph
* Returns : 0, failed; else, the fingerprint of the content
*
* Author :
* History :
* 1.create 11:10:2008
*********************************************************************/
NLPIR_API unsigned long NLPIR_FingerPrint(const char *sLine);
/*********************************************************************
*
* Func Name : NLPIR_SetPOSmap
*
* Description: select which POS map will be used
*
* Parameters : nPOSmap, one of:
* ICT_POS_MAP_FIRST  ICT (Institute of Computing Technology) first-level POS tag set
* ICT_POS_MAP_SECOND ICT (Institute of Computing Technology) second-level POS tag set
* PKU_POS_MAP_SECOND PKU (Peking University) second-level POS tag set
* PKU_POS_MAP_FIRST  PKU (Peking University) first-level POS tag set
* Returns : 0, failed; else, success
*
* Author :
* History :
* 1.create 11:10:2008
*********************************************************************/
NLPIR_API int NLPIR_SetPOSmap(int nPOSmap);
/*********************************************************************
*
* class CNLPIR
* Description:
* NLPIR class. NLPIR_Init() must be called before it is used and NLPIR_Exit() must be called on exit.
* Several CNLPIR instances may be in use at the same time, which supports multi-threaded word segmentation.
* Each thread first calls GetActiveInstance() to obtain a processing instance, then calls
* SetAvailable(false) to claim the instance for itself;
* when processing is finished it calls SetAvailable(true) to release the instance.
* History :
* 1.create 2005-11-10
*********************************************************************/
#ifdef OS_LINUX
class CNLPIR {
#else
class __declspec(dllexport) CNLPIR {
#endif
public:
CNLPIR();
~CNLPIR();
double FileProcess(const char *sSourceFilename,const char *sResultFilename,int bPOStagged=1);
//Process a file; counterpart of the C-style function NLPIR_FileProcess
const char * ParagraphProcess(const char *sLine,int bPOStagged=1);
//Process a paragraph; counterpart of the C-style function NLPIR_ParagraphProcess
const result_t * ParagraphProcessA(const char *sParagraph,int *pResultCount,bool bUserDict=true);
//Process a paragraph; counterpart of the C-style function NLPIR_ParagraphProcessA
void ParagraphProcessAW(int nCount,result_t * result);
int GetParagraphProcessAWordCount(const char *sParagraph);
const char * GetKeyWords(const char *sLine,int nMaxKeyLimit,bool bWeightOut);
//Get keywords
const char * GetFileKeyWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
//Get keywords from a text file
const char * GetNewWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
//Get new words
//NOTE(review): the first parameter is named sFilename here, while the C-style
//NLPIR_GetNewWords names it sLine (input paragraph) - presumably text, not a file name; confirm
const char * GetFileNewWords(const char *sFilename,int nMaxKeyLimit,bool bWeightOut);
//Get new words from a text file
bool SetAvailable(bool bAvailable=true);//the current thread releases this instance so that the next thread can use it
bool IsAvailable();//tells whether this segmenter instance is currently occupied by a thread
//Returns the handle value stored in m_nHandle
unsigned int GetHandle()
{
return m_nHandle;
}
private:
unsigned int m_nHandle;//handle value of this instance; assigned automatically by the system, must not be modified by the user
bool m_bAvailable;//control flag for sharing between threads; assigned automatically by the system, must not be modified by the user
};
/*********************************************************************
*
* Func Name : GetActiveInstance
*
* Description: Get an available CNLPIR instance. Intended for multi-threaded development:
* first obtain an available CNLPIR, then call its member functions.
*
* Parameters : None
* Returns : CNLPIR*
*
* Author : Kevin Zhang
* History :
* 1.create 1:10:2012
*********************************************************************/
NLPIR_API CNLPIR* GetActiveInstance();
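/*********************************************************************
*
* The following functions were added in the 2013 version specifically for new word discovery.
* It is generally recommended to run this process offline; it is not suited to online processing.
* Once new word identification has finished, the results can be imported into the word segmentation system.
* These functions are prefixed with NLPIR_NWI (New Word Identification).
*********************************************************************/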
/*********************************************************************
*
* Func Name : NLPIR_NWI_Start
*
* Description: Start new word identification
*
* Parameters : None
* Returns : bool, true:success, false:fail
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API bool NLPIR_NWI_Start();//New Word Identification Start
/*********************************************************************
*
* Func Name : NLPIR_NWI_AddFile
*
* Description: Add a text file, in which new words are to be identified, to the new word identification system.
* Only valid after NLPIR_NWI_Start() has been run.
*
* Parameters : const char *sFilename: the file name
* Returns : int. The author documented this as "bool, true:success, false:fail";
* presumably non-zero means success and 0 means failure - confirm
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API int NLPIR_NWI_AddFile(const char *sFilename);
/*********************************************************************
*
* Func Name : NLPIR_NWI_AddMem
*
* Description: Add a block of in-memory text, in which new words are to be identified, to the new word identification system.
* Only valid after NLPIR_NWI_Start() has been run.
*
* Parameters : const char *sText: the text to add
* Returns : bool, true:success, false:fail
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API bool NLPIR_NWI_AddMem(const char *sText);
/*********************************************************************
*
* Func Name : NLPIR_NWI_Complete
*
* Description: Signal that no more content will be added for new word identification.
* Only valid after NLPIR_NWI_Start() has been run.
*
* Parameters : None
* Returns : bool, true:success, false:fail
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API bool NLPIR_NWI_Complete();//new words
/*********************************************************************
*
* Func Name : NLPIR_NWI_GetResult
*
* Description: Get the result of new word identification.
* Only valid after NLPIR_NWI_Complete() has been run.
*
* Parameters : bWeightOut: whether the weight of each new word should be output (default false)
*
* Returns : the output format is
* [new word 1] [weight 1] [new word 2] [weight 2] ...
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API const char * NLPIR_NWI_GetResult(bool bWeightOut=false);//output the new word identification result
/*********************************************************************
*
* Func Name : NLPIR_NWI_Result2UserDict
*
* Description: Import the new word identification result into the user dictionary.
* Only valid after NLPIR_NWI_Complete() has been run.
* To keep the new words permanently, it is recommended to call NLPIR_SaveTheUsrDic afterwards.
* Parameters : None
* Returns : unsigned int, the number of new word results
* (the author's banner originally said "bool, true:success, false:fail",
* which does not match the declared return type)
*
* Author : Kevin Zhang
* History :
* 1.create 2012/11/23
*********************************************************************/
NLPIR_API unsigned int NLPIR_NWI_Result2UserDict();//convert the new word identification result into user dictionary entries; returns the number of new word results
#endif//!defined(__NLPIR_2013_LIB_INCLUDED__)