From 9ff314621235d6b748abb128edf0331480d0eaaf Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 00:13:41 +0100 Subject: [PATCH] added support of entity aliases --- war/src/main/java/pnews/NER.java | 2 +- war/src/main/java/pnews/OpenNLP.java | 2 +- war/src/main/java/pnews/servlet/Config.java | 26 ++++++++++++++++++++++++-- war/src/main/resources/feeds.json | 5 ++++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 2055cf1..5e7ce29 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -33,7 +33,7 @@ public class NER { for (Triple t: triples) { w = str.substring(t.second, t.third); if (!config.isBlacklistedEntity(w) && !entities.contains(w)) - entities.add(w); + entities.add(config.getEntityAlias(w)); } LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java index c383cee..e158a00 100644 --- a/war/src/main/java/pnews/OpenNLP.java +++ b/war/src/main/java/pnews/OpenNLP.java @@ -97,7 +97,7 @@ public class OpenNLP { LOG.finest(entity + " " + s.getProb() + " " + s.toString()); if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) - entities.add(entity); + entities.add(config.getEntityAlias(entity)); } return entities; diff --git a/war/src/main/java/pnews/servlet/Config.java b/war/src/main/java/pnews/servlet/Config.java index fec3770..46e75f6 100644 --- a/war/src/main/java/pnews/servlet/Config.java +++ b/war/src/main/java/pnews/servlet/Config.java @@ -28,6 +28,7 @@ public class Config { private Category[] categories; private Language[] languages; private final Set blacklistedEntities = new HashSet<>(); + private final HashMap entityAliases = new HashMap<>(); private static final String CLASS_NAME = Config.class.getName(); /** @@ -83,13 +84,13 @@ public class Config { } private void loadEntities(JsonObject jroot) { - JsonObject jentities; + JsonObject jentities, jaliases; JsonArray jblacklist; final String METHOD_NAME = "loadEntities"; jentities = jroot.getJsonObject("entities"); - jblacklist = jentities.getJsonArray("blacklist"); + jblacklist = jentities.getJsonArray("blacklist"); jblacklist.forEach((jv)-> { JsonString js; @@ -97,7 +98,28 @@ public class Config { blacklistedEntities.add(js.getString()); }); + jaliases = jentities.getJsonObject("aliases"); + jaliases.forEach((k, v)-> { + JsonArray jsources = (JsonArray)v; + + jsources.forEach((jsource)-> { + entityAliases.put(((JsonString)jsource).getString(), k); + }); + }); + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " entityAliases=" + entityAliases); + } + + public String getEntityAlias(String entity) { + String result; + + result = entityAliases.get(entity); + + if (result == null) + return entity; + else + return result; } public void loadConfig() throws UnsupportedEncodingException { diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 7ec9e49..337e19a 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -146,6 +146,9 @@ "Digital Trends", "Joey Sneddon", "CA" - ] + ], + "aliases": { + "U.S.": ["United States", "US"] + } } } -- 2.7.4