From: Jean-Philippe Orsini Date: Fri, 3 Nov 2017 00:07:42 +0000 (+0100) Subject: limit memory by using only one classifier instance X-Git-Url: http://git.wpitchoune.net/gitweb/?p=pnews.git;a=commitdiff_plain;h=63c2717409a3235573418e6bc0d9bd0fae8356e4 limit memory by using only one classifier instance added feeds --- diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index bcb8951..2745868 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -6,19 +6,14 @@ import java.util.List; import java.util.logging.Logger; import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Triple; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { private static final String CLASS_NAME = NER.class.getName(); private static final Logger LOG = Logger.getLogger(CLASS_NAME); - private static final ThreadLocal<CRFClassifier<CoreLabel>> classifier = new ThreadLocal<CRFClassifier<CoreLabel>>() { - @Override - protected CRFClassifier<CoreLabel> initialValue() { - return CRFClassifier.getDefaultClassifier(); - } - }; + private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier(); public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { @@ -31,18 +26,20 @@ public class NER { OpenNLP.classify(str, entities); synchronized (classifier) { - triples = classifier.get().classifyToCharacterOffsets(str); - for (Triple<String, Integer, Integer> t: triples) { - w = str.substring(t.second, t.third); - if (!entities.contains(w)) - entities.add(w); - } + triples = classifier.classifyToCharacterOffsets(str); + } + + for (Triple<String, Integer, Integer> t: triples) { + w = str.substring(t.second, t.third); + if (!entities.contains(w)) + entities.add(w); } entities.remove("CNET"); entities.remove("Read More"); entities.remove("New"); entities.remove("App"); + entities.remove("Digital Trends"); LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); diff --git 
a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 90df5cd..dd2b0fd 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -54,6 +54,11 @@ "label": "Technology", "title": "Technology", "language": "en" + }, { + "id": "en_linux", + "label": "Linux", + "title": "Linux", + "language": "en" } ], "feeds": { @@ -115,6 +120,7 @@ "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, - "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]} + "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}, + "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]} } }