Added OpenNLP support
[pnews.git] / war / src / main / java / pnews / OpenNLP.java
1 package pnews;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.util.List;
6 import java.util.logging.Logger;
7
8 import opennlp.tools.namefind.NameFinderME;
9 import opennlp.tools.namefind.TokenNameFinderModel;
10 import opennlp.tools.tokenize.TokenizerME;
11 import opennlp.tools.tokenize.TokenizerModel;
12 import opennlp.tools.util.Span;
13
14 /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
15 public class OpenNLP {
16         private static final String CLASS_NAME = OpenNLP.class.getName();
17         private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
18         private static TokenNameFinderModel organizationModel;
19         private static TokenNameFinderModel personModel;
20         private static TokenNameFinderModel locationModel;
21         private static TokenNameFinderModel timeModel;
22         private static TokenizerModel tokenModel;
23
24         public static List<String> classify(String str, List<String> entities) throws IOException {
25                 classify(str, getOrganizationModel(), entities);
26                 
27                 classify(str, getPersonModel(), entities);
28                 classify(str, getLocationModel(), entities);
29                 
30                 classify(str, getTimeModel(), entities);                
31                 
32                 return entities;
33         }
34         
35         private static TokenNameFinderModel getOrganizationModel() throws IOException {
36                 synchronized (OpenNLP.class) {
37                         if (organizationModel == null) {
38                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin");
39                                 organizationModel = new TokenNameFinderModel(inputStream);
40                         }
41                 }
42                 
43                 return organizationModel;
44         }
45
46         private static TokenNameFinderModel getPersonModel() throws IOException {
47                 synchronized (OpenNLP.class) {
48                         if (personModel == null) {
49                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin");
50                                 personModel = new TokenNameFinderModel(inputStream);
51                         }
52                 }
53                 
54                 return personModel;
55         }
56
57         private static TokenNameFinderModel getLocationModel() throws IOException {
58                 synchronized (OpenNLP.class) {
59                         if (locationModel == null) {
60                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin");
61                                 locationModel = new TokenNameFinderModel(inputStream);
62                         }
63                 }
64                 
65                 return locationModel;
66         }
67
68         private static TokenNameFinderModel getTimeModel() throws IOException {
69                 synchronized (OpenNLP.class) {
70                         if (timeModel == null) {
71                                 InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-time.bin");
72                                 timeModel = new TokenNameFinderModel(inputStream);
73                         }
74                 }
75                 
76                 return timeModel;
77         }
78         
79         private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities) throws IOException {
80                 String entity;
81                 
82                 NameFinderME nameFinder = new NameFinderME(model);
83                 String[] tokens = tokenize(str);
84                 Span nameSpans[] = nameFinder.find(tokens);
85                 
86                 for(Span s: nameSpans)  {
87                         if (s.getProb() < 0.60)
88                                 continue ;
89                         
90                         entity = null;
91                         for (int i = s.getStart(); i < s.getEnd(); i++)
92                                 if (entity == null)
93                                         entity = tokens[i];
94                                 else
95                                         entity += " " + tokens[i];
96                                 
97                         LOG.finest(entity + " " + s.getProb() + " " + s.toString());
98                         if (!entities.contains(entity))
99                                 entities.add(entity);
100                 }
101                 
102                 return entities;
103         } 
104
105         public static String[] tokenize(String sentence) throws IOException { 
106                 synchronized (OpenNLP.class) {
107                         if (tokenModel == null) {
108                                 InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); 
109                                 tokenModel = new TokenizerModel(inputStreamTokenizer);
110                         }
111                 }
112                 TokenizerME tokenizer = new TokenizerME(tokenModel);
113                 return tokenizer.tokenize(sentence);
114         }
115 }