Added support for entity aliases
[pnews.git] / war / src / main / java / pnews / OpenNLP.java
1 package pnews;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.util.List;
6 import java.util.logging.Logger;
7
8 import opennlp.tools.namefind.NameFinderME;
9 import opennlp.tools.namefind.TokenNameFinderModel;
10 import opennlp.tools.tokenize.TokenizerME;
11 import opennlp.tools.tokenize.TokenizerModel;
12 import opennlp.tools.util.Span;
13 import pnews.servlet.Config;
14
15 /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
16 public class OpenNLP {
17         private static final String CLASS_NAME = OpenNLP.class.getName();
18         private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
19         private static TokenNameFinderModel organizationModel;
20         private static TokenNameFinderModel personModel;
21         private static TokenNameFinderModel locationModel;
22         private static TokenNameFinderModel timeModel;
23         private static TokenizerModel tokenModel;
24
25         public static List<String> classify(String str, List<String> entities, Config config) throws IOException {
26                 classify(str, getOrganizationModel(), entities, config);
27                 
28                 classify(str, getPersonModel(), entities, config);
29                 classify(str, getLocationModel(), entities, config);
30                 
31                 classify(str, getTimeModel(), entities, config);                
32                 
33                 return entities;
34         }
35         
36         private static TokenNameFinderModel getOrganizationModel() throws IOException {
37                 synchronized (OpenNLP.class) {
38                         if (organizationModel == null) {
39                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin");
40                                 organizationModel = new TokenNameFinderModel(inputStream);
41                         }
42                 }
43                 
44                 return organizationModel;
45         }
46
47         private static TokenNameFinderModel getPersonModel() throws IOException {
48                 synchronized (OpenNLP.class) {
49                         if (personModel == null) {
50                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin");
51                                 personModel = new TokenNameFinderModel(inputStream);
52                         }
53                 }
54                 
55                 return personModel;
56         }
57
58         private static TokenNameFinderModel getLocationModel() throws IOException {
59                 synchronized (OpenNLP.class) {
60                         if (locationModel == null) {
61                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin");
62                                 locationModel = new TokenNameFinderModel(inputStream);
63                         }
64                 }
65                 
66                 return locationModel;
67         }
68
69         private static TokenNameFinderModel getTimeModel() throws IOException {
70                 synchronized (OpenNLP.class) {
71                         if (timeModel == null) {
72                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-time.bin");
73                                 timeModel = new TokenNameFinderModel(inputStream);
74                         }
75                 }
76                 
77                 return timeModel;
78         }
79         
80         private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities, Config config) throws IOException {
81                 String entity;
82                 
83                 NameFinderME nameFinder = new NameFinderME(model);
84                 String[] tokens = tokenize(str);
85                 Span nameSpans[] = nameFinder.find(tokens);
86                 
87                 for(Span s: nameSpans)  {
88                         if (s.getProb() < 0.60)
89                                 continue ;
90                         
91                         entity = null;
92                         for (int i = s.getStart(); i < s.getEnd(); i++)
93                                 if (entity == null)
94                                         entity = tokens[i];
95                                 else
96                                         entity += " " + tokens[i];
97                                 
98                         LOG.finest(entity + " " + s.getProb() + " " + s.toString());
99                         if (!config.isBlacklistedEntity(entity) && !entities.contains(entity))
100                                 entities.add(config.getEntityAlias(entity));
101                 }
102                 
103                 return entities;
104         } 
105
106         public static String[] tokenize(String sentence) throws IOException { 
107                 synchronized (OpenNLP.class) {
108                         if (tokenModel == null) {
109                                 InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); 
110                                 tokenModel = new TokenizerModel(inputStreamTokenizer);
111                         }
112                 }
113                 TokenizerME tokenizer = new TokenizerME(tokenModel);
114                 return tokenizer.tokenize(sentence);
115         }
116 }