99e344df25a6fadeb3604d0386776f78f11cae5b
[pnews.git] / war / src / main / java / pnews / OpenNLP.java
1 package pnews;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.util.List;
6 import java.util.logging.Logger;
7
8 import opennlp.tools.namefind.NameFinderME;
9 import opennlp.tools.namefind.TokenNameFinderModel;
10 import opennlp.tools.tokenize.TokenizerME;
11 import opennlp.tools.tokenize.TokenizerModel;
12 import opennlp.tools.util.Span;
13 import pnews.servlet.Config;
14
15 /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
16 public class OpenNLP {
17         private static final String CLASS_NAME = OpenNLP.class.getName();
18         private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
19         private static TokenNameFinderModel organizationModel;
20         private static TokenNameFinderModel personModel;
21         private static TokenNameFinderModel locationModel;
22         private static TokenizerModel tokenModel;
23
24         public static List<String> classify(String str, List<String> entities, Config config) throws IOException {
25                 classify(str, getOrganizationModel(), entities, config);
26                 classify(str, getPersonModel(), entities, config);
27                 classify(str, getLocationModel(), entities, config);
28                 
29                 return entities;
30         }
31         
32         private static TokenNameFinderModel getOrganizationModel() throws IOException {
33                 synchronized (OpenNLP.class) {
34                         if (organizationModel == null) {
35                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin");
36                                 organizationModel = new TokenNameFinderModel(inputStream);
37                         }
38                 }
39                 
40                 return organizationModel;
41         }
42
43         private static TokenNameFinderModel getPersonModel() throws IOException {
44                 synchronized (OpenNLP.class) {
45                         if (personModel == null) {
46                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin");
47                                 personModel = new TokenNameFinderModel(inputStream);
48                         }
49                 }
50                 
51                 return personModel;
52         }
53
54         private static TokenNameFinderModel getLocationModel() throws IOException {
55                 synchronized (OpenNLP.class) {
56                         if (locationModel == null) {
57                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin");
58                                 locationModel = new TokenNameFinderModel(inputStream);
59                         }
60                 }
61                 
62                 return locationModel;
63         }
64         
65         private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities, Config config) throws IOException {
66                 String entity;
67                 
68                 NameFinderME nameFinder = new NameFinderME(model);
69                 String[] tokens = tokenize(str);
70                 Span nameSpans[] = nameFinder.find(tokens);
71                 
72                 for(Span s: nameSpans)  {
73                         if (s.getProb() < 0.60)
74                                 continue ;
75                         
76                         entity = null;
77                         for (int i = s.getStart(); i < s.getEnd(); i++)
78                                 if (entity == null)
79                                         entity = tokens[i];
80                                 else
81                                         entity += " " + tokens[i];
82                                 
83                         LOG.finest(entity + " " + s.getProb() + " " + s.toString());
84                         if (!config.isBlacklistedEntity(entity) && !entities.contains(entity))
85                                 entities.add(config.getEntityAlias(entity));
86                 }
87                 
88                 return entities;
89         } 
90
91         public static String[] tokenize(String sentence) throws IOException { 
92                 synchronized (OpenNLP.class) {
93                         if (tokenModel == null) {
94                                 InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); 
95                                 tokenModel = new TokenizerModel(inputStreamTokenizer);
96                         }
97                 }
98                 TokenizerME tokenizer = new TokenizerME(tokenModel);
99                 return tokenizer.tokenize(sentence);
100         }
101 }