import java.util.logging.Logger;
import edu.stanford.nlp.ie.crf.CRFClassifier;
-import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Triple;
/** https://stanfordnlp.github.io/CoreNLP/api.html */
public class NER {
private static final String CLASS_NAME = NER.class.getName();
private static final Logger LOG = Logger.getLogger(CLASS_NAME);
- private static final ThreadLocal<CRFClassifier<CoreLabel>> classifier = new ThreadLocal<CRFClassifier<CoreLabel>>() {
- @Override
- protected CRFClassifier<CoreLabel> initialValue() {
- return CRFClassifier.getDefaultClassifier();
- }
- };
+ private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier();
public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
OpenNLP.classify(str, entities);
synchronized (classifier) {
- triples = classifier.get().classifyToCharacterOffsets(str);
- for (Triple<String, Integer, Integer> t: triples) {
- w = str.substring(t.second, t.third);
- if (!entities.contains(w))
- entities.add(w);
- }
+ triples = classifier.classifyToCharacterOffsets(str);
+ }
+
+ for (Triple<String, Integer, Integer> t: triples) {
+ w = str.substring(t.second, t.third);
+ if (!entities.contains(w))
+ entities.add(w);
}
entities.remove("CNET");
entities.remove("Read More");
entities.remove("New");
entities.remove("App");
+ entities.remove("Digital Trends");
LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
"label": "Technology",
"title": "Technology",
"language": "en"
+ }, {
+ "id": "en_linux",
+ "label": "Linux",
+ "title": "Linux",
+ "language": "en"
}
],
"feeds": {
"http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]},
"https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]},
"https://gizmodo.com/rss": { "categories": ["en_technologie"]},
- "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}
+ "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]},
+ "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}
}
}