import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Triple;
+import pnews.servlet.Config;
/** https://stanfordnlp.github.io/CoreNLP/api.html */
public class NER {
private static final Logger LOG = Logger.getLogger(CLASS_NAME);
private static final CRFClassifier<CoreMap> classifier = CRFClassifier.getDefaultClassifier();
- public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
+ public static List<String> classify(String str, List<String> entities, Config config) throws ClassCastException, ClassNotFoundException, IOException {
List<Triple<String, Integer, Integer>> triples;
String w;
LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
- OpenNLP.classify(str, entities);
+ OpenNLP.classify(str, entities, config);
synchronized (classifier) {
triples = classifier.classifyToCharacterOffsets(str);
for (Triple<String, Integer, Integer> t: triples) {
w = str.substring(t.second, t.third);
- if (!entities.contains(w))
+ if (!config.isBlacklistedEntity(w) && !entities.contains(w))
entities.add(w);
}
- entities.remove("CNET");
- entities.remove("Read More");
- entities.remove("New");
- entities.remove("App");
- entities.remove("Digital Trends");
-
LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
return entities;
public static void main(String[] args) throws Exception {
List<String> lst;
- lst = classify("I live in Washington and New York in United States.", new ArrayList<>());
+ lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config());
for (String str: lst)
System.out.println(str);
}
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
+import pnews.servlet.Config;
/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/
public class OpenNLP {
private static TokenNameFinderModel timeModel;
private static TokenizerModel tokenModel;
- public static List<String> classify(String str, List<String> entities) throws IOException {
- classify(str, getOrganizationModel(), entities);
+ public static List<String> classify(String str, List<String> entities, Config config) throws IOException {
+ classify(str, getOrganizationModel(), entities, config);
- classify(str, getPersonModel(), entities);
- classify(str, getLocationModel(), entities);
+ classify(str, getPersonModel(), entities, config);
+ classify(str, getLocationModel(), entities, config);
- classify(str, getTimeModel(), entities);
+ classify(str, getTimeModel(), entities, config);
return entities;
}
return timeModel;
}
- private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities) throws IOException {
+ private static List<String> classify(String str, TokenNameFinderModel model, List<String> entities, Config config) throws IOException {
String entity;
NameFinderME nameFinder = new NameFinderME(model);
entity += " " + tokens[i];
LOG.finest(entity + " " + s.getProb() + " " + s.toString());
- if (!entities.contains(entity))
+ if (!config.isBlacklistedEntity(entity) && !entities.contains(entity))
entities.add(entity);
}
return false;
}
- private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
+ private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
String desc, title, thumbnail, feedTitle, str;
Date date;
List<String> entities;
entities = new ArrayList<>();
if (desc != null && lang.equals("en"))
try {
- NER.classify(title, entities);
- NER.classify(desc, entities);
+ NER.classify(title, entities, config);
+ NER.classify(desc, entities, config);
} catch (ClassCastException | ClassNotFoundException | IOException e1) {
LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
}
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
+import java.util.logging.Level;
import java.util.logging.Logger;
import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonObject;
+import javax.json.JsonString;
import javax.json.JsonValue;
import pnews.Category;
private Feed[] feeds;
private Category[] categories;
private Language[] languages;
+ private final Set<String> blacklistedEntities = new HashSet<>();
+ private static final String CLASS_NAME = Config.class.getName();
+
/**
* The key is the language, the value is the default category for this language.
*/
private Map<String, Category> defaultCategories = new HashMap<>();
- private static final Logger LOG = Logger.getLogger(Config.class.getName());
+ private static final Logger LOG = Logger.getLogger(CLASS_NAME);
private void loadCategories(JsonArray jcats) {
List<Category> cats;
return null;
}
+ /**
+  * Loads the entity blacklist from the configuration root.
+  * <p>
+  * Reads the string array found at {@code entities.blacklist} in
+  * {@code jroot} and stores its values in {@code blacklistedEntities}.
+  * A configuration that has no {@code entities} object, or no
+  * {@code blacklist} array inside it, yields an empty blacklist instead
+  * of failing with a {@link NullPointerException}.
+  *
+  * @param jroot the root object of the parsed configuration
+  * @throws ClassCastException if {@code entities} is not an object,
+  *         {@code blacklist} is not an array, or an element of the array
+  *         is not a JSON string
+  */
+ private void loadEntities(JsonObject jroot) {
+     final String METHOD_NAME = "loadEntities";
+     JsonObject jentities;
+     JsonArray jblacklist;
+
+     // Start from a clean set so that loading again replaces the previous
+     // blacklist rather than accumulating entries removed from the file.
+     blacklistedEntities.clear();
+
+     // getJsonObject / getJsonArray return null when the key is absent.
+     jentities = jroot.getJsonObject("entities");
+     jblacklist = (jentities == null) ? null : jentities.getJsonArray("blacklist");
+
+     if (jblacklist != null) {
+         for (JsonValue jv : jblacklist) {
+             JsonString js;
+
+             js = (JsonString)jv;
+             blacklistedEntities.add(js.getString());
+         }
+     }
+
+     LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities);
+ }
+
public void loadConfig() throws UnsupportedEncodingException {
Reader r;
JsonObject jfeeds, jroot;
});
feeds = feedList.toArray(new Feed[0]);
+
+ loadEntities(jroot);
+ }
+
+ /**
+  * Tells whether an entity is present in the blacklist held by this
+  * configuration. The lookup is an exact match on the string.
+  *
+  * @param e the entity text to look up
+  * @return {@code true} if {@code blacklistedEntities} contains {@code e},
+  *         {@code false} otherwise
+  */
+ public boolean isBlacklistedEntity(String e) {
+     final String METHOD_NAME = "isBlacklistedEntity";
+
+     LOG.entering(CLASS_NAME, METHOD_NAME, e);
+
+     final boolean blacklisted = blacklistedEntities.contains(e);
+
+     LOG.exiting(CLASS_NAME, METHOD_NAME, blacklisted);
+     return blacklisted;
}
public Feed[] getFeeds() {
"http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] },
"http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]},
"http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] },
+ "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] },
"http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] },
"http://www.frandroid.com/feed": { "categories": ["technologie"] },
"http://www.silicon.fr/feed": { "categories": ["technologie"] },
"https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] },
"https://korben.info/feed": { "categories": ["technologie"]},
"https://www.techhive.com/index.rss": { "categories": ["en_technologie"]},
- "https://www.gnome.org/feed/": { "categories": ["en_technologie"]},
- "http://www.markshuttleworth.com/feed": { "categories": ["en_technologie"]},
- "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]},
"https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]},
"https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]},
"https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]},
"https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]},
"https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]},
"http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]},
- "https://www.debian.org/News/news": { "categories": ["en_technologie"]},
"https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]},
"https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]},
"http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]},
"https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]},
"https://gizmodo.com/rss": { "categories": ["en_technologie"]},
"https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]},
- "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}
+ "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]},
+ "https://www.debian.org/News/news": { "categories": ["en_linux"]},
+ "http://www.markshuttleworth.com/feed": { "categories": ["en_linux"]},
+ "https://insights.ubuntu.com/feed/": { "categories": ["en_linux"]},
+ "http://feeds.feedburner.com/LinuxJournal-BreakingNews?format=xml": { "categories": ["en_linux"]},
+ "https://www.gnome.org/feed/": { "categories": ["en_linux"]},
+ "http://linuxreviews.org/en.rss": { "categories": ["en_linux"]},
+ "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]}
+ },
+ "entities": {
+ "blacklist": [
+ "CNET",
+ "Read More",
+ "Digital Trends",
+ "Joey Sneddon",
+ "CA"
+ ]
}
}