From 6d94fd5a39e6f78a68201230bd57b9ceb95e125d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sat, 4 Nov 2017 23:29:11 +0100 Subject: [PATCH] blacklisted entities are now in the configuration file --- war/src/main/java/pnews/NER.java | 15 +++----- war/src/main/java/pnews/OpenNLP.java | 15 ++++---- .../main/java/pnews/servlet/ArticleProvider.java | 6 ++-- war/src/main/java/pnews/servlet/Config.java | 42 +++++++++++++++++++++- war/src/main/resources/feeds.json | 23 +++++++++--- 5 files changed, 75 insertions(+), 26 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 2745868..2055cf1 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -8,6 +8,7 @@ import java.util.logging.Logger; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Triple; +import pnews.servlet.Config; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { @@ -15,7 +16,7 @@ public class NER { private static final Logger LOG = Logger.getLogger(CLASS_NAME); private static final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); - public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { + public static List classify(String str, List entities, Config config) throws ClassCastException, ClassNotFoundException, IOException { List> triples; String w; @@ -23,7 +24,7 @@ public class NER { LOG.entering(CLASS_NAME, FUNCTION_NAME, str); - OpenNLP.classify(str, entities); + OpenNLP.classify(str, entities, config); synchronized (classifier) { triples = classifier.classifyToCharacterOffsets(str); @@ -31,16 +32,10 @@ public class NER { for (Triple t: triples) { w = str.substring(t.second, t.third); - if (!entities.contains(w)) + if (!config.isBlacklistedEntity(w) && !entities.contains(w)) entities.add(w); } - entities.remove("CNET"); - entities.remove("Read More"); - entities.remove("New"); - entities.remove("App"); - entities.remove("Digital Trends"); - LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); return entities; @@ -49,7 +44,7 @@ public class NER { public static void main(String[] args) throws Exception { List lst; - lst = classify("I live in Washington and New York in United States.", new ArrayList<>()); + lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config()); for (String str: lst) System.out.println(str); } diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java index 07fbba5..c383cee 100644 --- a/war/src/main/java/pnews/OpenNLP.java +++ b/war/src/main/java/pnews/OpenNLP.java @@ -10,6 +10,7 @@ import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; +import pnews.servlet.Config; /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ public class OpenNLP { @@ -21,13 +22,13 @@ public class OpenNLP { private static TokenNameFinderModel timeModel; private static TokenizerModel tokenModel; - public static List classify(String str, List entities) throws IOException { - classify(str, getOrganizationModel(), entities); + public static List classify(String str, List entities, Config config) throws IOException { + classify(str, getOrganizationModel(), entities, config); - classify(str, getPersonModel(), entities); - classify(str, getLocationModel(), entities); + classify(str, getPersonModel(), entities, config); + classify(str, getLocationModel(), entities, config); - classify(str, getTimeModel(), entities); + classify(str, getTimeModel(), entities, config); return entities; } @@ -76,7 +77,7 @@ public class OpenNLP { return timeModel; } - private static List classify(String str, TokenNameFinderModel model, List entities) throws IOException { + private static List classify(String str, TokenNameFinderModel model, List entities, Config config) throws IOException { String entity; NameFinderME nameFinder = new NameFinderME(model); @@ -95,7 +96,7 @@ public class OpenNLP { entity += " " + tokens[i]; LOG.finest(entity + " " + s.getProb() + " " + s.toString()); - if (!entities.contains(entity)) + if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) entities.add(entity); } diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index c2d8f59..ec74123 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -74,7 +74,7 @@ public class ArticleProvider { return false; } - private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { + private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { String desc, title, thumbnail, feedTitle, str; Date date; List entities; @@ -111,8 +111,8 @@ public class ArticleProvider { entities = new ArrayList<>(); if (desc != null && lang.equals("en")) try { - NER.classify(title, entities); - NER.classify(desc, entities); + NER.classify(title, entities, config); + NER.classify(desc, entities, config); } catch (ClassCastException | ClassNotFoundException | IOException e1) { LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1); } diff --git a/war/src/main/java/pnews/servlet/Config.java b/war/src/main/java/pnews/servlet/Config.java index bafb606..fec3770 100644 --- a/war/src/main/java/pnews/servlet/Config.java +++ b/war/src/main/java/pnews/servlet/Config.java @@ -6,13 +6,17 @@ import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.logging.Level; import java.util.logging.Logger; import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonObject; +import javax.json.JsonString; import javax.json.JsonValue; import pnews.Category; @@ -23,11 +27,14 @@ public class Config { private Feed[] feeds; private Category[] categories; private Language[] languages; + private final Set blacklistedEntities = new HashSet<>(); + private static final String CLASS_NAME = Config.class.getName(); + /** * The key is the language, the value is the default category for this language. */ private Map defaultCategories = new HashMap<>(); - private static final Logger LOG = Logger.getLogger(Config.class.getName()); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); private void loadCategories(JsonArray jcats) { List cats; @@ -75,6 +82,24 @@ public class Config { return null; } + private void loadEntities(JsonObject jroot) { + JsonObject jentities; + JsonArray jblacklist; + final String METHOD_NAME = "loadEntities"; + + jentities = jroot.getJsonObject("entities"); + jblacklist = jentities.getJsonArray("blacklist"); + + jblacklist.forEach((jv)-> { + JsonString js; + + js = (JsonString)jv; + blacklistedEntities.add(js.getString()); + }); + + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); + } + public void loadConfig() throws UnsupportedEncodingException { Reader r; JsonObject jfeeds, jroot; @@ -116,6 +141,21 @@ public class Config { }); feeds = feedList.toArray(new Feed[0]); + + loadEntities(jroot); + } + + public boolean isBlacklistedEntity(String e) { + final String METHOD_NAME = "isBlacklistedEntity"; + boolean result; + + LOG.entering(CLASS_NAME, METHOD_NAME, e); + + result = blacklistedEntities.contains(e); + + LOG.exiting(CLASS_NAME, METHOD_NAME, result); + + return result; } public Feed[] getFeeds() { diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index ecb8b1a..a06ae41 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -91,6 +91,7 @@ "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]}, "http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] }, + "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] }, "http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] }, "http://www.frandroid.com/feed": { "categories": ["technologie"] }, "http://www.silicon.fr/feed": { "categories": ["technologie"] }, @@ -99,9 +100,6 @@ "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] }, "https://korben.info/feed": { "categories": ["technologie"]}, "https://www.techhive.com/index.rss": { "categories": ["en_technologie"]}, - "https://www.gnome.org/feed/": { "categories": ["en_technologie"]}, - "http://www.markshuttleworth.com/feed": { "categories": ["en_technologie"]}, - "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]}, "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]}, "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]}, "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]}, @@ -114,13 +112,28 @@ "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}, - "https://www.debian.org/News/news": { "categories": ["en_technologie"]}, "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]}, "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}, - "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]} + "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}, + "https://www.debian.org/News/news": { "categories": ["en_linux"]}, + "http://www.markshuttleworth.com/feed": { "categories": ["en_linux"]}, + "https://insights.ubuntu.com/feed/": { "categories": ["en_linux"]}, + "http://feeds.feedburner.com/LinuxJournal-BreakingNews?format=xml": { "categories": ["en_linux"]}, + "https://www.gnome.org/feed/": { "categories": ["en_linux"]}, + "http://linuxreviews.org/en.rss": { "categories": ["en_linux"]}, + "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]} + }, + "entities": { + "blacklist": [ + "CNET", + "Read More", + "Digital Trends", + "Joey Sneddon", + "CA" + ] } } -- 2.7.4