fixed multi-term entity handling of Stanford NER
author Jean-Philippe Orsini <orsinije@fr.ibm.com>
Thu, 2 Nov 2017 16:25:14 +0000 (17:25 +0100)
committer Jean-Philippe Orsini <orsinije@fr.ibm.com>
Thu, 2 Nov 2017 16:25:14 +0000 (17:25 +0100)
war/src/main/java/pnews/NER.java
war/src/main/resources/feeds.json

index ac34c08..3a6fc82 100644 (file)
@@ -6,8 +6,8 @@ import java.util.List;
 import java.util.logging.Logger;
 
 import edu.stanford.nlp.ie.crf.CRFClassifier;
-import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
 import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.Triple;
 
 /** https://stanfordnlp.github.io/CoreNLP/api.html */
 public class NER {
@@ -15,25 +15,23 @@ public class NER {
         private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
         
         public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
-                CRFClassifier<CoreLabel> classifier;
-                List<List<CoreLabel>> out;
-                String cat, w;
-                final String FUNCTION_NAME = "classify";                
+                final CRFClassifier<CoreLabel> classifier = CRFClassifier.getDefaultClassifier();
+                List<Triple<String, Integer, Integer>> triples;
+                String w;
+                final String FUNCTION_NAME = "classify";                       
                 
                 LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
 
                 OpenNLP.classify(str, entities);
-                
-                classifier = CRFClassifier.getDefaultClassifier();
-                out = classifier.classify(str);
-                
-                for (List<CoreLabel> labels: out)
-                        for (CoreLabel l: labels) {
-                                cat = l.getString(AnswerAnnotation.class);
-                                w = l.word();
-                                if (!cat.equals("O") && !entities.contains(w))
+                                
+                synchronized (classifier) {
+                        triples = classifier.classifyToCharacterOffsets(str);
+                        for (Triple<String, Integer, Integer> t: triples) {
+                                w = str.substring(t.second, t.third);
+                                if (!entities.contains(w))
                                         entities.add(w);
                         }
+                }
                 
                 entities.remove("CNET");
                 entities.remove("Read More");
@@ -46,6 +44,10 @@ public class NER {
         }
         
         public static void main(String[] args) throws Exception {
-                classify("I live in Washington.", new ArrayList<>());
+                List<String> lst;
+                
+                lst = classify("I live in Washington and New York in United States.", new ArrayList<>());
+                for (String str: lst)
+                        System.out.println(str);
         }
 }
\ No newline at end of file
index e4d9079..0118278 100644 (file)
                 "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]},
                 "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]},
                 "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]},
-                "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}
+                "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]},
+                "https://www.debian.org/News/news": { "categories": ["en_technologie"]}
         }
 }