Clean up and refactor to move to the net.wpitchoune package
[pnews.git] / war / src / main / java / net / wpitchoune / pnews / classifier / NamedEntityRecognizer.java
diff --git a/war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java b/war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java
new file mode 100644 (file)
index 0000000..0f9ee73
--- /dev/null
@@ -0,0 +1,51 @@
+package net.wpitchoune.pnews.classifier;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import edu.stanford.nlp.ie.crf.CRFClassifier;
+import edu.stanford.nlp.util.CoreMap;
+import edu.stanford.nlp.util.Triple;
+import net.wpitchoune.pnews.Config;
+
+/** https://stanfordnlp.github.io/CoreNLP/api.html */
+public class NamedEntityRecognizer {
+        private static final String CLASS_NAME = NamedEntityRecognizer.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME);
+        private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier();
+        
+        public static List<String> classify(String str, List<String> entities, Config config) throws ClassCastException, ClassNotFoundException, IOException {
+                
+                List<Triple<String, Integer, Integer>> triples;
+                String w;
+                final String FUNCTION_NAME = "classify";                       
+                
+                LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
+
+                OpenNLP.classify(str, entities, config);
+                                
+                synchronized (classifier) {
+                        triples = classifier.classifyToCharacterOffsets(str);
+                }
+                 
+                for (Triple<String, Integer, Integer> t: triples) {
+                        w = str.substring(t.second, t.third);
+                        if (!config.isBlacklistedEntity(w) && !entities.contains(w))
+                                entities.add(config.getEntityAlias(w));
+                }
+                
+                LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
+                
+                return entities;
+        }
+        
+        public static void main(String[] args) throws Exception {
+                List<String> lst;
+                
+                lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config());
+                for (String str: lst)
+                        System.out.println(str);
+        }
+}
\ No newline at end of file