blacklisted entities are now in the configuration file
author Jean-Philippe Orsini <orsinije@fr.ibm.com>
Sat, 4 Nov 2017 22:29:11 +0000 (23:29 +0100)
committer Jean-Philippe Orsini <orsinije@fr.ibm.com>
Sat, 4 Nov 2017 22:29:11 +0000 (23:29 +0100)
war/src/main/java/pnews/NER.java
war/src/main/java/pnews/OpenNLP.java
war/src/main/java/pnews/servlet/ArticleProvider.java
war/src/main/java/pnews/servlet/Config.java
war/src/main/resources/feeds.json

index 2745868..2055cf1 100644 (file)
@@ -8,6 +8,7 @@ import java.util.logging.Logger;
 import edu.stanford.nlp.ie.crf.CRFClassifier;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Triple;
+import pnews.servlet.Config;
 
 /** https://stanfordnlp.github.io/CoreNLP/api.html */
 public class NER {
@@ -15,7 +16,7 @@ public class NER {
         private static final Logger LOG = Logger.getLogger(CLASS_NAME);
         private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier();
         
-        public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
+        public static List<String> classify(String str, List<String> entities, Config config) throws ClassCastException, ClassNotFoundException, IOException {
                 
                 List<Triple<String, Integer, Integer>> triples;
                 String w;
@@ -23,7 +24,7 @@ public class NER {
                 
                 LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
 
-                OpenNLP.classify(str, entities);
+                OpenNLP.classify(str, entities, config);
                                 
                 synchronized (classifier) {
                         triples = classifier.classifyToCharacterOffsets(str);
@@ -31,16 +32,10 @@ public class NER {
                  
                 for (Triple<String, Integer, Integer> t: triples) {
                         w = str.substring(t.second, t.third);
-                        if (!entities.contains(w))
+                        if (!config.isBlacklistedEntity(w) && !entities.contains(w))
                                 entities.add(w);
                 }
                 
-                entities.remove("CNET");
-                entities.remove("Read More");
-                entities.remove("New");
-                entities.remove("App");
-                entities.remove("Digital Trends");
-                
                 LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
                 
                 return entities;
@@ -49,7 +44,7 @@ public class NER {
         public static void main(String[] args) throws Exception {
                 List<String> lst;
                 
-                lst = classify("I live in Washington and New York in United States.", new ArrayList<>());
+                lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config());
                 for (String str: lst)
                         System.out.println(str);
         }
index 07fbba5..c383cee 100644 (file)
@@ -10,6 +10,7 @@ import opennlp.tools.namefind.TokenNameFinderModel;
 import opennlp.tools.tokenize.TokenizerME;
 import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.util.Span;
+import pnews.servlet.Config;
 
 /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
 public class OpenNLP {
@@ -21,13 +22,13 @@ public class OpenNLP {
         private static TokenNameFinderModel timeModel;
         private static TokenizerModel tokenModel;
 
-        public static List<String> classify(String str, List<String> entities) throws IOException {
-                classify(str, getOrganizationModel(), entities);
+        public static List<String> classify(String str, List<String> entities, Config config) throws IOException {
+                classify(str, getOrganizationModel(), entities, config);
                 
-                classify(str, getPersonModel(), entities);
-                classify(str, getLocationModel(), entities);
+                classify(str, getPersonModel(), entities, config);
+                classify(str, getLocationModel(), entities, config);
                 
-                classify(str, getTimeModel(), entities);                
+                classify(str, getTimeModel(), entities, config);                
                 
                 return entities;
         }
@@ -76,7 +77,7 @@ public class OpenNLP {
                 return timeModel;
         }
         
-        private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities) throws IOException {
+        private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities, Config config) throws IOException {
                 String entity;
                 
                 NameFinderME nameFinder = new NameFinderME(model);
@@ -95,7 +96,7 @@ public class OpenNLP {
                                         entity += " " + tokens[i];
                                 
                         LOG.finest(entity + " " + s.getProb() + " " + s.toString());
-                        if (!entities.contains(entity))
+                        if (!config.isBlacklistedEntity(entity) && !entities.contains(entity))
                                 entities.add(entity);
                 }
                 
index c2d8f59..ec74123 100644 (file)
@@ -74,7 +74,7 @@ public class ArticleProvider {
                 return false;
         }
         
-        private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
+        private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
                 String desc, title, thumbnail, feedTitle, str;
                 Date date;
                 List<String> entities;
@@ -111,8 +111,8 @@ public class ArticleProvider {
                 entities = new ArrayList<>();
                 if (desc != null && lang.equals("en"))
                         try {
-                                NER.classify(title, entities);
-                                NER.classify(desc, entities);
+                                NER.classify(title, entities, config);
+                                NER.classify(desc, entities, config);
                         } catch (ClassCastException | ClassNotFoundException | IOException e1) {
                                 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);                         
                         }
index bafb606..fec3770 100644 (file)
@@ -6,13 +6,17 @@ import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import javax.json.Json;
 import javax.json.JsonArray;
 import javax.json.JsonObject;
+import javax.json.JsonString;
 import javax.json.JsonValue;
 
 import pnews.Category;
@@ -23,11 +27,14 @@ public class Config {
         private Feed[] feeds;
         private Category[] categories;
         private Language[] languages;
+        private final Set<String> blacklistedEntities = new HashSet<>();
+        private static final String CLASS_NAME = Config.class.getName();
+        
         /**
          * The key is the language, the value is the default category for this language.
          */
         private Map<String, Category> defaultCategories = new HashMap<>();
-        private static final Logger LOG = Logger.getLogger(Config.class.getName());
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME);
                 
         private void loadCategories(JsonArray jcats) {
                 List<Category> cats;
@@ -75,6 +82,24 @@ public class Config {
                 return null;
         }
         
+        private void loadEntities(JsonObject jroot) { // Fills blacklistedEntities from the "entities" -> "blacklist" array of jroot.
+                JsonObject jentities;
+                JsonArray jblacklist;
+                final String METHOD_NAME = "loadEntities";
+                // Neither lookup below is null-checked, so both keys are expected to be present.
+                jentities = jroot.getJsonObject("entities"); // NOTE(review): javax.json returns null for an absent key - confirm "entities" is mandatory
+                jblacklist = jentities.getJsonArray("blacklist"); // throws NullPointerException if jentities is null
+                // Copy every array element into the blacklist set.
+                jblacklist.forEach((jv)-> {
+                        JsonString js;
+                        // The cast throws ClassCastException for any element that is not a JSON string.
+                        js = (JsonString)jv;
+                        blacklistedEntities.add(js.getString());
+                });
+                // Trace the resulting set at FINEST level.
+                LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities);
+        }
+        
         public void loadConfig() throws UnsupportedEncodingException {
                 Reader r;
                 JsonObject jfeeds, jroot;
@@ -116,6 +141,21 @@ public class Config {
                 });
                 
                 feeds = feedList.toArray(new Feed[0]);
+                
+                loadEntities(jroot);
+        }
+        
+        public boolean isBlacklistedEntity(String e) { // Returns true when e is contained in blacklistedEntities.
+                final String METHOD_NAME = "isBlacklistedEntity";
+                boolean result;
+                
+                LOG.entering(CLASS_NAME, METHOD_NAME, e);
+                // Exact match only: the set is a HashSet<String>, so the lookup is case-sensitive.
+                result = blacklistedEntities.contains(e);
+                
+                LOG.exiting(CLASS_NAME, METHOD_NAME, result);
+                
+                return result;
         }
         
         public Feed[] getFeeds() {
index ecb8b1a..a06ae41 100644 (file)
@@ -91,6 +91,7 @@
                 "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] },
                 "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]},
                 "http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] },
+                "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] },
                 "http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] },
                 "http://www.frandroid.com/feed": { "categories": ["technologie"] },
                 "http://www.silicon.fr/feed": { "categories": ["technologie"] },
                 "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] },
                 "https://korben.info/feed": { "categories": ["technologie"]},
                 "https://www.techhive.com/index.rss": { "categories": ["en_technologie"]},
-                "https://www.gnome.org/feed/": { "categories": ["en_technologie"]},
-                "http://www.markshuttleworth.com/feed": { "categories": ["en_technologie"]},
-                "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]},
                 "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]},
                 "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]},
                 "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]},
                 "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]},
                 "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]},
                 "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]},
-                "https://www.debian.org/News/news": { "categories": ["en_technologie"]},
                 "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]},
                 "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]},
                 "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]},
                 "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]},
                 "https://gizmodo.com/rss": { "categories": ["en_technologie"]},
                 "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]},
-                "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}
+                "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]},
+                "https://www.debian.org/News/news": { "categories": ["en_linux"]},
+                "http://www.markshuttleworth.com/feed": { "categories": ["en_linux"]},
+                "https://insights.ubuntu.com/feed/": { "categories": ["en_linux"]},
+                "http://feeds.feedburner.com/LinuxJournal-BreakingNews?format=xml": { "categories": ["en_linux"]},
+                "https://www.gnome.org/feed/": { "categories": ["en_linux"]},
+                "http://linuxreviews.org/en.rss": { "categories": ["en_linux"]},
+                "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]}
+        },
+        "entities": {
+                "blacklist": [
+                        "CNET",
+                        "Read More",
+                        "Digital Trends",
+                        "Joey Sneddon",
+                        "CA"
+                ]
         }
 }