Clean up and refactor: move to the net.wpitchoune package
[pnews.git] / war / src / main / java / net / wpitchoune / pnews / classifier / OpenNLP.java
diff --git a/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java b/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java
new file mode 100644 (file)
index 0000000..ff9707d
--- /dev/null
@@ -0,0 +1,101 @@
+package net.wpitchoune.pnews.classifier;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.logging.Logger;
+
+import net.wpitchoune.pnews.Config;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
+public class OpenNLP {
+        private static final String CLASS_NAME = OpenNLP.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
+        private static TokenNameFinderModel organizationModel;
+        private static TokenNameFinderModel personModel;
+        private static TokenNameFinderModel locationModel;
+        private static TokenizerModel tokenModel;
+
+        public static List<String> classify(String str, List<String> entities, Config config) throws IOException {
+                classify(str, getOrganizationModel(), entities, config);
+                classify(str, getPersonModel(), entities, config);
+                classify(str, getLocationModel(), entities, config);
+                
+                return entities;
+        }
+        
+        private static TokenNameFinderModel getOrganizationModel() throws IOException {
+                synchronized (OpenNLP.class) {
+                        if (organizationModel == null) {
+                                InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin");
+                                organizationModel = new TokenNameFinderModel(inputStream);
+                        }
+                }
+                
+                return organizationModel;
+        }
+
+        private static TokenNameFinderModel getPersonModel() throws IOException {
+                synchronized (OpenNLP.class) {
+                        if (personModel == null) {
+                                InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin");
+                                personModel = new TokenNameFinderModel(inputStream);
+                        }
+                }
+                
+                return personModel;
+        }
+
+        private static TokenNameFinderModel getLocationModel() throws IOException {
+                synchronized (OpenNLP.class) {
+                        if (locationModel == null) {
+                                InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin");
+                                locationModel = new TokenNameFinderModel(inputStream);
+                        }
+                }
+                
+                return locationModel;
+        }
+        
+        private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities, Config config) throws IOException {
+                String entity;
+                
+                NameFinderME nameFinder = new NameFinderME(model);
+                String[] tokens = tokenize(str);
+                Span nameSpans[] = nameFinder.find(tokens);
+                
+                for(Span s: nameSpans)  {
+                        if (s.getProb() < 0.60)
+                                continue ;
+                        
+                        entity = null;
+                        for (int i = s.getStart(); i < s.getEnd(); i++)
+                                if (entity == null)
+                                        entity = tokens[i];
+                                else
+                                        entity += " " + tokens[i];
+                                
+                        LOG.finest(entity + " " + s.getProb() + " " + s.toString());
+                        if (!config.isBlacklistedEntity(entity) && !entities.contains(entity))
+                                entities.add(config.getEntityAlias(entity));
+                }
+                
+                return entities;
+        } 
+
+        private static String[] tokenize(String sentence) throws IOException { 
+                synchronized (OpenNLP.class) {
+                        if (tokenModel == null) {
+                                InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); 
+                                tokenModel = new TokenizerModel(inputStreamTokenizer);
+                        }
+                }
+                TokenizerME tokenizer = new TokenizerME(tokenModel);
+                return tokenizer.tokenize(sentence);
+        }
+}