limit memory by using only one classifier instance
author Jean-Philippe Orsini <orsinije@fr.ibm.com>
Fri, 3 Nov 2017 00:07:42 +0000 (01:07 +0100)
committer Jean-Philippe Orsini <orsinije@fr.ibm.com>
Fri, 3 Nov 2017 00:07:42 +0000 (01:07 +0100)
added feeds

war/src/main/java/pnews/NER.java
war/src/main/resources/feeds.json

index bcb8951..2745868 100644 (file)
@@ -6,19 +6,14 @@ import java.util.List;
 import java.util.logging.Logger;
 
 import edu.stanford.nlp.ie.crf.CRFClassifier;
-import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Triple;
 
 /** https://stanfordnlp.github.io/CoreNLP/api.html */
 public class NER {
         private static final String CLASS_NAME = NER.class.getName();
         private static final Logger LOG = Logger.getLogger(CLASS_NAME);
-        private static final ThreadLocal<CRFClassifier<CoreLabel>> classifier = new ThreadLocal<CRFClassifier<CoreLabel>>() {
-                @Override 
-                protected CRFClassifier<CoreLabel> initialValue() {
-                        return CRFClassifier.getDefaultClassifier();
-                }
-        };
+        private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier();
         
         public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
                 
@@ -31,18 +26,20 @@ public class NER {
                 OpenNLP.classify(str, entities);
                                 
                 synchronized (classifier) {
-                        triples = classifier.get().classifyToCharacterOffsets(str);
-                        for (Triple<String, Integer, Integer> t: triples) {
-                                w = str.substring(t.second, t.third);
-                                if (!entities.contains(w))
-                                        entities.add(w);
-                        }
+                        triples = classifier.classifyToCharacterOffsets(str);
+                }
+                 
+                for (Triple<String, Integer, Integer> t: triples) {
+                        w = str.substring(t.second, t.third);
+                        if (!entities.contains(w))
+                                entities.add(w);
                 }
                 
                 entities.remove("CNET");
                 entities.remove("Read More");
                 entities.remove("New");
                 entities.remove("App");
+                entities.remove("Digital Trends");
                 
                 LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
                 
index 90df5cd..dd2b0fd 100644 (file)
                         "label": "Technology",
                         "title": "Technology",
                         "language": "en"
+                }, {
+                        "id": "en_linux",
+                        "label": "Linux",
+                        "title": "Linux",
+                        "language": "en"
                 }
         ],
         "feeds": {
                 "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]},
                 "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]},
                 "https://gizmodo.com/rss": { "categories": ["en_technologie"]},
-                "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}
+                "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]},
+                "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}
         }
 }