From aff83c8798602b535d13edeaffdb8f4238e2bbf5 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Mon, 6 Nov 2017 12:16:15 +0100 Subject: [PATCH] cleanup and refactored to move to net.wpitchoune package --- .../main/java/net/wpitchoune/pnews/Article.java | 69 ++++++ .../java/net/wpitchoune/pnews/ArticleFactory.java | 6 + .../java/net/wpitchoune/pnews/ArticleStore.java | 70 ++++++ .../main/java/net/wpitchoune/pnews/Category.java | 40 ++++ war/src/main/java/net/wpitchoune/pnews/Config.java | 234 +++++++++++++++++++ .../main/java/net/wpitchoune/pnews/EntityStat.java | 27 +++ war/src/main/java/net/wpitchoune/pnews/Feed.java | 24 ++ .../main/java/net/wpitchoune/pnews/Language.java | 21 ++ .../pnews/classifier/NamedEntityRecognizer.java | 51 ++++ .../net/wpitchoune/pnews/classifier/OpenNLP.java | 101 ++++++++ war/src/main/java/pnews/Article.java | 41 ---- war/src/main/java/pnews/Category.java | 40 ---- war/src/main/java/pnews/EntityStat.java | 27 --- war/src/main/java/pnews/Feed.java | 24 -- war/src/main/java/pnews/Language.java | 21 -- war/src/main/java/pnews/NER.java | 51 ---- war/src/main/java/pnews/OpenNLP.java | 101 -------- .../main/java/pnews/servlet/ArticleFactory.java | 8 - .../main/java/pnews/servlet/ArticleProvider.java | 27 +-- war/src/main/java/pnews/servlet/ArticleStore.java | 72 ------ war/src/main/java/pnews/servlet/Config.java | 259 --------------------- war/src/main/java/pnews/servlet/HTML.java | 25 +- war/src/main/java/pnews/servlet/JSON.java | 10 +- war/src/main/java/pnews/servlet/Pnews.java | 10 +- war/src/main/resources/feeds.json | 1 + 25 files changed, 683 insertions(+), 677 deletions(-) create mode 100644 war/src/main/java/net/wpitchoune/pnews/Article.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/ArticleFactory.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/ArticleStore.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/Category.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/Config.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/EntityStat.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/Feed.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/Language.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java create mode 100644 war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java delete mode 100644 war/src/main/java/pnews/Article.java delete mode 100644 war/src/main/java/pnews/Category.java delete mode 100644 war/src/main/java/pnews/EntityStat.java delete mode 100644 war/src/main/java/pnews/Feed.java delete mode 100644 war/src/main/java/pnews/Language.java delete mode 100644 war/src/main/java/pnews/NER.java delete mode 100644 war/src/main/java/pnews/OpenNLP.java delete mode 100644 war/src/main/java/pnews/servlet/ArticleFactory.java delete mode 100644 war/src/main/java/pnews/servlet/ArticleStore.java delete mode 100644 war/src/main/java/pnews/servlet/Config.java diff --git a/war/src/main/java/net/wpitchoune/pnews/Article.java b/war/src/main/java/net/wpitchoune/pnews/Article.java new file mode 100644 index 0000000..9445a24 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/Article.java @@ -0,0 +1,69 @@ +package net.wpitchoune.pnews; + +import java.time.Instant; +import java.util.concurrent.atomic.AtomicLong; + +public class Article { + private final String title; + private final String description; + private final String thumbnail; + private final String link; + private final Instant publicationDate; + private final String website; + private final String[] entities; + private final AtomicLong readCount = new AtomicLong(); + + public Article(String link, String title, String description, String thumbnail, Instant publicationDate, String website, String[] entities) { + this.link = link; + this.title = title; + this.description = description; + this.thumbnail = thumbnail; + this.publicationDate = publicationDate; + this.website = website; + this.entities = entities; + } + + public String getTitle() { + return title; + } + + public String getDescription() { + return description; + } + + public String getLink() { + return link; + } + + public String getThumbnail() { + return thumbnail; + } + + public String getWebsite() { + return website; + } + + public long getReadCount() { + return readCount.get(); + } + + public String[] getEntities() { + return entities; + } + + public boolean hasEntity(String entity) { + for (String e: entities) + if (e.equals(entity)) + return true; + + return false; + } + + public Instant getPublicationDate() { + return publicationDate; + } + + public void incrementReadCount() { + readCount.incrementAndGet(); + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/ArticleFactory.java b/war/src/main/java/net/wpitchoune/pnews/ArticleFactory.java new file mode 100644 index 0000000..f279693 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/ArticleFactory.java @@ -0,0 +1,6 @@ +package net.wpitchoune.pnews; + +@FunctionalInterface +public interface ArticleFactory { + Article create(); +} diff --git a/war/src/main/java/net/wpitchoune/pnews/ArticleStore.java b/war/src/main/java/net/wpitchoune/pnews/ArticleStore.java new file mode 100644 index 0000000..f72bc16 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/ArticleStore.java @@ -0,0 +1,70 @@ +package net.wpitchoune.pnews; + +import java.lang.ref.WeakReference; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.logging.Logger; + +public class ArticleStore { + public final static ArticleStore singleton = new ArticleStore(); + private final HashMap> articles = new HashMap<>(); + private final static Logger LOG = Logger.getLogger(ArticleStore.class.getName()); + + public Article get(String link) { + WeakReference
ref; + Article result; + + synchronized (articles) { + ref = articles.get(link); + if (ref == null) { + return null; + } + + result = ref.get(); + if (result == null) { + LOG.info("Article has been released from memory: " + link); + articles.remove(link); + } + + return result; + } + } + + public Article getArticle(String link, ArticleFactory factory) { + Article result; + + synchronized (articles) { + result = get(link); + if (result == null) { + result = factory.create(); + articles.put(link, new WeakReference<>(result)); + LOG.fine("Article has been added: " + result.getLink()); + } + + return result; + } + } + + public long size() { + synchronized (this) { + return articles.size(); + } + } + + public Article[] getArticles() { + ArrayList
result; + Article a; + + synchronized (articles) { + result = new ArrayList<>(articles.size()); + + for (WeakReference
r: articles.values()) { + a = r.get(); + if (a != null) + result.add(a); + } + + return result.toArray(new Article[] {}); + } + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/Category.java b/war/src/main/java/net/wpitchoune/pnews/Category.java new file mode 100644 index 0000000..84bd6d5 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/Category.java @@ -0,0 +1,40 @@ +package net.wpitchoune.pnews; + +public class Category { + private final String id; + private final String label; + private final String title; + private final String language; + + public Category(String id, String label, String title, String language) { + this.id = id; + this.label = label; + this.title = title; + this.language = language; + } + + public String getTitle() { + return title; + } + + public String getURL() { + return "/" + language + "/" + id.toLowerCase(); + } + + public String getLabel() { + return label; + } + + public String getId() { + return id; + } + + public String getLanguage() { + return language; + } + + @Override + public String toString() { + return getLabel(); + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/Config.java b/war/src/main/java/net/wpitchoune/pnews/Config.java new file mode 100644 index 0000000..99ebb3f --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/Config.java @@ -0,0 +1,234 @@ +package net.wpitchoune.pnews; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.UnsupportedEncodingException; +import java.time.Instant; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; + +import javax.json.Json; +import javax.json.JsonArray; +import javax.json.JsonObject; +import javax.json.JsonString; +import javax.json.JsonValue; + +public class Config { + private Feed[] feeds; + private Category[] categories; + private Language[] languages; + private final Set blacklistedEntities = new HashSet<>(); + private final HashMap entityAliases = new HashMap<>(); + private static final String CLASS_NAME = Config.class.getName(); + + /** + * The key is the language, the value is the default category for this language. + */ + private Map defaultCategories = new HashMap<>(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + + private void loadCategories(JsonArray jcats) { + List cats; + JsonObject jcat; + Category cat; + String id, label, title, language; + + cats = new ArrayList<>(jcats.size()); + + for (JsonValue v: jcats) { + jcat = (JsonObject)v; + id = jcat.getString("id"); + label = jcat.getString("label"); + title = jcat.getString("title"); + language = jcat.getString("language"); + cat = new Category(id, label, title, language); + cats.add(cat); + if (defaultCategories.get(language) == null) + defaultCategories.put(language, cat); + } + + categories = cats.toArray(new Category[0]); + } + + private void loadLanguages(JsonArray jlangs) { + List langs; + JsonObject jlang; + String id; + + langs = new ArrayList<>(jlangs.size()); + + for (JsonValue v: jlangs) { + jlang = (JsonObject)v; + id = jlang.getString("id"); + langs.add(new Language(id)); + } + + languages = langs.toArray(new Language[0]); + } + + private Category getCategory(String id) { + for (Category c: categories) + if (c.getId().equals(id)) + return c; + return null; + } + + private void loadEntities(JsonObject jroot) { + JsonObject jentities, jaliases; + JsonArray jblacklist; + final String METHOD_NAME = "loadEntities"; + + jentities = jroot.getJsonObject("entities"); + + jblacklist = jentities.getJsonArray("blacklist"); + jblacklist.forEach((jv)-> { + JsonString js; + + js = (JsonString)jv; + blacklistedEntities.add(js.getString()); + }); + + jaliases = jentities.getJsonObject("aliases"); + jaliases.forEach((k, v)-> { + JsonArray jsources = (JsonArray)v; + + jsources.forEach((jsource)-> { + entityAliases.put(((JsonString)jsource).getString(), k); + }); + }); + + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " entityAliases=" + entityAliases); + } + + public String getEntityAlias(String entity) { + String result; + + result = entityAliases.get(entity); + + if (result == null) + return entity; + else + return result; + } + + public void loadConfig() throws UnsupportedEncodingException { + Reader r; + JsonObject jfeeds, jroot; + List feedList; + + r = null; + try { + r = new InputStreamReader(Config.class.getClassLoader().getResourceAsStream("feeds.json"), + "UTF-8"); + jroot = Json.createReader(r).readObject(); + } finally { + if (r != null) + try { r.close(); } catch (IOException e) { }; + } + + loadLanguages(jroot.getJsonArray("languages")); + loadCategories(jroot.getJsonArray("categories")); + + jfeeds = jroot.getJsonObject("feeds"); + + feedList = new ArrayList(jfeeds.size()); + + jfeeds.forEach((k, v)-> { + JsonObject jf; + String str; + Category cat; + JsonArray jcategories; + + jf = (JsonObject)v; + jcategories = jf.getJsonArray("categories"); + str = jcategories.getString(0); + + cat = getCategory(str); + + if (cat != null) + feedList.add(new Feed(k, cat)); + else + LOG.severe("Missing category: " + str); + }); + + feeds = feedList.toArray(new Feed[0]); + + loadEntities(jroot); + } + + public boolean isBlacklistedEntity(String e) { + final String METHOD_NAME = "isBlacklistedEntity"; + boolean result; + + LOG.entering(CLASS_NAME, METHOD_NAME, e); + + result = blacklistedEntities.contains(e); + + LOG.exiting(CLASS_NAME, METHOD_NAME, result); + + return result; + } + + public boolean isObsolete(Instant instant) { + Instant olderInstant; + + olderInstant = Instant.now().minus(60, ChronoUnit.DAYS); + + if (instant.isAfter(olderInstant)) + return false; + else + return true; + } + + public Feed[] getFeeds() { + return feeds; + } + + public Map> getFeedsByCategory() { + Map> result; + Feed[] feeds; + List catFeeds; + Category cat; + + result = new HashMap<>(); + + feeds = getFeeds(); + for (Feed f: feeds) { + cat = f.getCategory(); + + catFeeds = result.get(cat); + if (catFeeds == null) { + catFeeds = new ArrayList(); + result.put(cat, catFeeds); + } + catFeeds.add(f); + } + + return result; + } + + public Category[] getCategories() { + return categories; + } + + public Category getDefaultCategory(Language lang) { + return defaultCategories.get(lang.getId()); + } + + public Language[] getLanguages() { + return languages; + } + + public Language getDefaultLanguage() { + return languages[0]; + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/EntityStat.java b/war/src/main/java/net/wpitchoune/pnews/EntityStat.java new file mode 100644 index 0000000..8468b01 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/EntityStat.java @@ -0,0 +1,27 @@ +package net.wpitchoune.pnews; + +public class EntityStat { + private final String entity; + private int count; + + public EntityStat(String entity) { + this.entity = entity; + } + + public void increment() { + count++; + } + + public int getCount() { + return count; + } + + public String getEntity() { + return entity; + } + + @Override + public String toString() { + return entity + "(" + count + ")"; + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/Feed.java b/war/src/main/java/net/wpitchoune/pnews/Feed.java new file mode 100644 index 0000000..d85f14a --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/Feed.java @@ -0,0 +1,24 @@ +package net.wpitchoune.pnews; + +public class Feed { + private final String URL; + private final Category category; + + public Feed(String URL, Category category) { + this.URL = URL; + this.category = category; + } + + public String getURL() { + return URL; + } + + public Category getCategory() { + return category; + } + + @Override + public String toString() { + return getURL() + "[" + category + "]"; + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/Language.java b/war/src/main/java/net/wpitchoune/pnews/Language.java new file mode 100644 index 0000000..89e58e2 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/Language.java @@ -0,0 +1,21 @@ +package net.wpitchoune.pnews; + +public class Language { + private final String id; + + public Language(String id) { + this.id = id; + } + + public String toURL() { + return "/" + id; + } + + public String getLabel() { + return id; + } + + public String getId() { + return id; + } +} diff --git a/war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java b/war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java new file mode 100644 index 0000000..0f9ee73 --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/classifier/NamedEntityRecognizer.java @@ -0,0 +1,51 @@ +package net.wpitchoune.pnews.classifier; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import edu.stanford.nlp.ie.crf.CRFClassifier; +import edu.stanford.nlp.util.CoreMap; +import edu.stanford.nlp.util.Triple; +import net.wpitchoune.pnews.Config; + +/** https://stanfordnlp.github.io/CoreNLP/api.html */ +public class NamedEntityRecognizer { + private static final String CLASS_NAME = NamedEntityRecognizer.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); + + public static List classify(String str, List entities, Config config) throws ClassCastException, ClassNotFoundException, IOException { + + List> triples; + String w; + final String FUNCTION_NAME = "classify"; + + LOG.entering(CLASS_NAME, FUNCTION_NAME, str); + + OpenNLP.classify(str, entities, config); + + synchronized (classifier) { + triples = classifier.classifyToCharacterOffsets(str); + } + + for (Triple t: triples) { + w = str.substring(t.second, t.third); + if (!config.isBlacklistedEntity(w) && !entities.contains(w)) + entities.add(config.getEntityAlias(w)); + } + + LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); + + return entities; + } + + public static void main(String[] args) throws Exception { + List lst; + + lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config()); + for (String str: lst) + System.out.println(str); + } +} \ No newline at end of file diff --git a/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java b/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java new file mode 100644 index 0000000..ff9707d --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java @@ -0,0 +1,101 @@ +package net.wpitchoune.pnews.classifier; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.logging.Logger; + +import net.wpitchoune.pnews.Config; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.Span; + +/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ +public class OpenNLP { + private static final String CLASS_NAME = OpenNLP.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static TokenNameFinderModel organizationModel; + private static TokenNameFinderModel personModel; + private static TokenNameFinderModel locationModel; + private static TokenizerModel tokenModel; + + public static List classify(String str, List entities, Config config) throws IOException { + classify(str, getOrganizationModel(), entities, config); + classify(str, getPersonModel(), entities, config); + classify(str, getLocationModel(), entities, config); + + return entities; + } + + private static TokenNameFinderModel getOrganizationModel() throws IOException { + synchronized (OpenNLP.class) { + if (organizationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin"); + organizationModel = new TokenNameFinderModel(inputStream); + } + } + + return organizationModel; + } + + private static TokenNameFinderModel getPersonModel() throws IOException { + synchronized (OpenNLP.class) { + if (personModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin"); + personModel = new TokenNameFinderModel(inputStream); + } + } + + return personModel; + } + + private static TokenNameFinderModel getLocationModel() throws IOException { + synchronized (OpenNLP.class) { + if (locationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin"); + locationModel = new TokenNameFinderModel(inputStream); + } + } + + return locationModel; + } + + private static List classify(String str, TokenNameFinderModel model, List entities, Config config) throws IOException { + String entity; + + NameFinderME nameFinder = new NameFinderME(model); + String[] tokens = tokenize(str); + Span nameSpans[] = nameFinder.find(tokens); + + for(Span s: nameSpans) { + if (s.getProb() < 0.60) + continue ; + + entity = null; + for (int i = s.getStart(); i < s.getEnd(); i++) + if (entity == null) + entity = tokens[i]; + else + entity += " " + tokens[i]; + + LOG.finest(entity + " " + s.getProb() + " " + s.toString()); + if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) + entities.add(config.getEntityAlias(entity)); + } + + return entities; + } + + private static String[] tokenize(String sentence) throws IOException { + synchronized (OpenNLP.class) { + if (tokenModel == null) { + InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); + tokenModel = new TokenizerModel(inputStreamTokenizer); + } + } + TokenizerME tokenizer = new TokenizerME(tokenModel); + return tokenizer.tokenize(sentence); + } +} diff --git a/war/src/main/java/pnews/Article.java b/war/src/main/java/pnews/Article.java deleted file mode 100644 index 5b2e21c..0000000 --- a/war/src/main/java/pnews/Article.java +++ /dev/null @@ -1,41 +0,0 @@ -package pnews; - -import java.time.Instant; -import java.util.concurrent.atomic.AtomicLong; - -public class Article { - public final String title; - public final String description; - public final String thumbnail; - public final String link; - public final Instant publicationDate; - public final String website; - public final String[] entities; - public final AtomicLong readCount = new AtomicLong(); - - public Article(String link, String title, String description, String thumbnail, Instant publicationDate, String website, String[] entities) { - this.link = link; - this.title = title; - this.description = description; - this.thumbnail = thumbnail; - this.publicationDate = publicationDate; - this.website = website; - this.entities = entities; - } - - public String[] getEntities() { - return entities; - } - - public boolean hasEntity(String entity) { - for (String e: entities) - if (e.equals(entity)) - return true; - - return false; - } - - public Instant getPublicationDate() { - return publicationDate; - } -} diff --git a/war/src/main/java/pnews/Category.java b/war/src/main/java/pnews/Category.java deleted file mode 100644 index ac59c59..0000000 --- a/war/src/main/java/pnews/Category.java +++ /dev/null @@ -1,40 +0,0 @@ -package pnews; - -public class Category { - private final String id; - private final String label; - private final String title; - private final String language; - - public Category(String id, String label, String title, String language) { - this.id = id; - this.label = label; - this.title = title; - this.language = language; - } - - public String getTitle() { - return title; - } - - public String getURL() { - return "/" + language + "/" + id.toLowerCase(); - } - - public String getLabel() { - return label; - } - - public String getId() { - return id; - } - - public String getLanguage() { - return language; - } - - @Override - public String toString() { - return getLabel(); - } -} diff --git a/war/src/main/java/pnews/EntityStat.java b/war/src/main/java/pnews/EntityStat.java deleted file mode 100644 index a2fcb25..0000000 --- a/war/src/main/java/pnews/EntityStat.java +++ /dev/null @@ -1,27 +0,0 @@ -package pnews; - -public class EntityStat { - private final String entity; - private int count; - - public EntityStat(String entity) { - this.entity = entity; - } - - public void increment() { - count++; - } - - public int getCount() { - return count; - } - - public String getEntity() { - return entity; - } - - @Override - public String toString() { - return entity + "(" + count + ")"; - } -} diff --git a/war/src/main/java/pnews/Feed.java b/war/src/main/java/pnews/Feed.java deleted file mode 100644 index b5e677e..0000000 --- a/war/src/main/java/pnews/Feed.java +++ /dev/null @@ -1,24 +0,0 @@ -package pnews; - -public class Feed { - private final String URL; - private final Category category; - - public Feed(String URL, Category category) { - this.URL = URL; - this.category = category; - } - - public String getURL() { - return URL; - } - - public Category getCategory() { - return category; - } - - @Override - public String toString() { - return getURL() + "[" + category + "]"; - } -} diff --git a/war/src/main/java/pnews/Language.java b/war/src/main/java/pnews/Language.java deleted file mode 100644 index 71bc6aa..0000000 --- a/war/src/main/java/pnews/Language.java +++ /dev/null @@ -1,21 +0,0 @@ -package pnews; - -public class Language { - private final String id; - - public Language(String id) { - this.id = id; - } - - public String toURL() { - return "/" + id; - } - - public String getLabel() { - return id; - } - - public String getId() { - return id; - } -} diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java deleted file mode 100644 index 5e7ce29..0000000 --- a/war/src/main/java/pnews/NER.java +++ /dev/null @@ -1,51 +0,0 @@ -package pnews; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.logging.Logger; - -import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.util.CoreMap; -import edu.stanford.nlp.util.Triple; -import pnews.servlet.Config; - -/** https://stanfordnlp.github.io/CoreNLP/api.html */ -public class NER { - private static final String CLASS_NAME = NER.class.getName(); - private static final Logger LOG = Logger.getLogger(CLASS_NAME); - private static final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); - - public static List classify(String str, List entities, Config config) throws ClassCastException, ClassNotFoundException, IOException { - - List> triples; - String w; - final String FUNCTION_NAME = "classify"; - - LOG.entering(CLASS_NAME, FUNCTION_NAME, str); - - OpenNLP.classify(str, entities, config); - - synchronized (classifier) { - triples = classifier.classifyToCharacterOffsets(str); - } - - for (Triple t: triples) { - w = str.substring(t.second, t.third); - if (!config.isBlacklistedEntity(w) && !entities.contains(w)) - entities.add(config.getEntityAlias(w)); - } - - LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); - - return entities; - } - - public static void main(String[] args) throws Exception { - List lst; - - lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config()); - for (String str: lst) - System.out.println(str); - } -} \ No newline at end of file diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java deleted file mode 100644 index 99e344d..0000000 --- a/war/src/main/java/pnews/OpenNLP.java +++ /dev/null @@ -1,101 +0,0 @@ -package pnews; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.logging.Logger; - -import opennlp.tools.namefind.NameFinderME; -import opennlp.tools.namefind.TokenNameFinderModel; -import opennlp.tools.tokenize.TokenizerME; -import opennlp.tools.tokenize.TokenizerModel; -import opennlp.tools.util.Span; -import pnews.servlet.Config; - -/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ -public class OpenNLP { - private static final String CLASS_NAME = OpenNLP.class.getName(); - private static final Logger LOG = Logger.getLogger(CLASS_NAME); - private static TokenNameFinderModel organizationModel; - private static TokenNameFinderModel personModel; - private static TokenNameFinderModel locationModel; - private static TokenizerModel tokenModel; - - public static List classify(String str, List entities, Config config) throws IOException { - classify(str, getOrganizationModel(), entities, config); - classify(str, getPersonModel(), entities, config); - classify(str, getLocationModel(), entities, config); - - return entities; - } - - private static TokenNameFinderModel getOrganizationModel() throws IOException { - synchronized (OpenNLP.class) { - if (organizationModel == null) { - InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin"); - organizationModel = new TokenNameFinderModel(inputStream); - } - } - - return organizationModel; - } - - private static TokenNameFinderModel getPersonModel() throws IOException { - synchronized (OpenNLP.class) { - if (personModel == null) { - InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin"); - personModel = new TokenNameFinderModel(inputStream); - } - } - - return personModel; - } - - private static TokenNameFinderModel getLocationModel() throws IOException { - synchronized (OpenNLP.class) { - if (locationModel == null) { - InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin"); - locationModel = new TokenNameFinderModel(inputStream); - } - } - - return locationModel; - } - - private static List classify(String str, TokenNameFinderModel model, List entities, Config config) throws IOException { - String entity; - - NameFinderME nameFinder = new NameFinderME(model); - String[] tokens = tokenize(str); - Span nameSpans[] = nameFinder.find(tokens); - - for(Span s: nameSpans) { - if (s.getProb() < 0.60) - continue ; - - entity = null; - for (int i = s.getStart(); i < s.getEnd(); i++) - if (entity == null) - entity = tokens[i]; - else - entity += " " + tokens[i]; - - LOG.finest(entity + " " + s.getProb() + " " + s.toString()); - if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) - entities.add(config.getEntityAlias(entity)); - } - - return entities; - } - - public static String[] tokenize(String sentence) throws IOException { - synchronized (OpenNLP.class) { - if (tokenModel == null) { - InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); - tokenModel = new TokenizerModel(inputStreamTokenizer); - } - } - TokenizerME tokenizer = new TokenizerME(tokenModel); - return tokenizer.tokenize(sentence); - } -} diff --git a/war/src/main/java/pnews/servlet/ArticleFactory.java b/war/src/main/java/pnews/servlet/ArticleFactory.java deleted file mode 100644 index b89bebb..0000000 --- a/war/src/main/java/pnews/servlet/ArticleFactory.java +++ /dev/null @@ -1,8 +0,0 @@ -package pnews.servlet; - -import pnews.Article; - -@FunctionalInterface -public interface ArticleFactory { - Article create(); -} diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index bdba835..55898a4 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -27,11 +27,13 @@ import com.rometools.rome.io.FeedException; import com.rometools.rome.io.SyndFeedInput; import com.rometools.rome.io.XmlReader; -import pnews.Article; -import pnews.Category; -import pnews.EntityStat; -import pnews.Feed; -import pnews.NER; +import net.wpitchoune.pnews.Article; +import net.wpitchoune.pnews.ArticleStore; +import net.wpitchoune.pnews.Category; +import net.wpitchoune.pnews.Config; +import net.wpitchoune.pnews.EntityStat; +import net.wpitchoune.pnews.Feed; +import net.wpitchoune.pnews.classifier.NamedEntityRecognizer; public class ArticleProvider { private static final String CLASS_NAME = ArticleProvider.class.getName(); @@ -70,7 +72,7 @@ public class ArticleProvider { private boolean exists(String articleLink, List
articles) { synchronized (articles) { for (Article a: articles) - if (a.link.equals(articleLink)) + if (a.getLink().equals(articleLink)) return true; } return false; @@ -91,7 +93,6 @@ public class ArticleProvider { private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang, Instant instant) { String desc, title, thumbnail, feedTitle, str; - Date date; List entities; feedTitle = feed.getTitle(); @@ -119,9 +120,9 @@ public class ArticleProvider { entities = new ArrayList<>(); if (lang.equals("en")) try { - NER.classify(title, entities, config); + NamedEntityRecognizer.classify(title, entities, config); if (desc != null) - NER.classify(desc, entities, config); + NamedEntityRecognizer.classify(desc, entities, config); } catch (ClassCastException | ClassNotFoundException | IOException e1) { LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1); } @@ -159,13 +160,13 @@ public class ArticleProvider { Collections.sort(articles, new Comparator
() { @Override public int compare(Article o1, Article o2) { - if (o1.publicationDate == o2.publicationDate) + if (o1.getPublicationDate() == o2.getPublicationDate()) return 0; - if (o1.publicationDate == null) + if (o1.getPublicationDate() == null) return 1; - if (o2.publicationDate == null) + if (o2.getPublicationDate() == null) return -1; - return o2.publicationDate.compareTo(o1.publicationDate); + return o2.getPublicationDate().compareTo(o1.getPublicationDate()); } }); } diff --git a/war/src/main/java/pnews/servlet/ArticleStore.java b/war/src/main/java/pnews/servlet/ArticleStore.java deleted file mode 100644 index f60b819..0000000 --- a/war/src/main/java/pnews/servlet/ArticleStore.java +++ /dev/null @@ -1,72 +0,0 @@ -package pnews.servlet; - -import java.lang.ref.WeakReference; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.logging.Logger; - -import pnews.Article; - -public class ArticleStore { - public final static ArticleStore singleton = new ArticleStore(); - private final HashMap> articles = new HashMap<>(); - private final static Logger LOG = Logger.getLogger(ArticleStore.class.getName()); - - public Article get(String link) { - WeakReference
ref; - Article result; - - synchronized (articles) { - ref = articles.get(link); - if (ref == null) { - return null; - } - - result = ref.get(); - if (result == null) { - LOG.info("Article has been released from memory: " + link); - articles.remove(link); - } - - return result; - } - } - - public Article getArticle(String link, ArticleFactory factory) { - Article result; - - synchronized (articles) { - result = get(link); - if (result == null) { - result = factory.create(); - articles.put(link, new WeakReference<>(result)); - LOG.fine("Article has been added: " + result.link); - } - - return result; - } - } - - public long size() { - synchronized (this) { - return articles.size(); - } - } - - public Article[] getArticles() { - ArrayList
result; - Article a; - - synchronized (articles) { - result = new ArrayList<>(articles.size()); - - for (WeakReference
r: articles.values()) { - a = r.get(); - if (a != null) - result.add(a); - } - - return result.toArray(new Article[] {}); - } - } -} diff --git a/war/src/main/java/pnews/servlet/Config.java b/war/src/main/java/pnews/servlet/Config.java deleted file mode 100644 index 78db694..0000000 --- a/war/src/main/java/pnews/servlet/Config.java +++ /dev/null @@ -1,259 +0,0 @@ -package pnews.servlet; - -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.io.UnsupportedEncodingException; -import java.time.Instant; -import java.time.temporal.ChronoUnit; -import java.time.temporal.TemporalUnit; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; - -import javax.json.Json; -import javax.json.JsonArray; -import javax.json.JsonObject; -import javax.json.JsonString; -import javax.json.JsonValue; - -import pnews.Article; -import pnews.Category; -import pnews.Feed; -import pnews.Language; - -public class Config { - private Feed[] feeds; - private Category[] categories; - private Language[] languages; - private final Set blacklistedEntities = new HashSet<>(); - private final HashMap entityAliases = new HashMap<>(); - private static final String CLASS_NAME = Config.class.getName(); - - /** - * The key is the language, the value is the default category for this language. - */ - private Map defaultCategories = new HashMap<>(); - private static final Logger LOG = Logger.getLogger(CLASS_NAME); - - private void loadCategories(JsonArray jcats) { - List cats; - JsonObject jcat; - Category cat; - String id, label, title, language; - - cats = new ArrayList<>(jcats.size()); - - for (JsonValue v: jcats) { - jcat = (JsonObject)v; - id = jcat.getString("id"); - label = jcat.getString("label"); - title = jcat.getString("title"); - language = jcat.getString("language"); - cat = new Category(id, label, title, language); - cats.add(cat); - if (defaultCategories.get(language) == null) - defaultCategories.put(language, cat); - } - - categories = cats.toArray(new Category[0]); - } - - private void loadLanguages(JsonArray jlangs) { - List langs; - JsonObject jlang; - String id; - - langs = new ArrayList<>(jlangs.size()); - - for (JsonValue v: jlangs) { - jlang = (JsonObject)v; - id = jlang.getString("id"); - langs.add(new Language(id)); - } - - languages = langs.toArray(new Language[0]); - } - - private Category getCategory(String id) { - for (Category c: categories) - if (c.getId().equals(id)) - return c; - return null; - } - - private void loadEntities(JsonObject jroot) { - JsonObject jentities, jaliases; - JsonArray jblacklist; - final String METHOD_NAME = "loadEntities"; - - jentities = jroot.getJsonObject("entities"); - - jblacklist = jentities.getJsonArray("blacklist"); - jblacklist.forEach((jv)-> { - JsonString js; - - js = (JsonString)jv; - blacklistedEntities.add(js.getString()); - }); - - jaliases = jentities.getJsonObject("aliases"); - jaliases.forEach((k, v)-> { - JsonArray jsources = (JsonArray)v; - - jsources.forEach((jsource)-> { - entityAliases.put(((JsonString)jsource).getString(), k); - }); - }); - - LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); - LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " entityAliases=" + entityAliases); - } - - public String getEntityAlias(String entity) { - String result; - - result = entityAliases.get(entity); - - if (result == null) - return entity; - else - return result; - } - - public void loadConfig() throws UnsupportedEncodingException { - Reader r; - JsonObject jfeeds, jroot; - List feedList; - - r = null; - try { - r = new InputStreamReader(Config.class.getClassLoader().getResourceAsStream("feeds.json"), - "UTF-8"); - jroot = Json.createReader(r).readObject(); - } finally { - if (r != null) - try { r.close(); } catch (IOException e) { }; - } - - loadLanguages(jroot.getJsonArray("languages")); - loadCategories(jroot.getJsonArray("categories")); - - jfeeds = jroot.getJsonObject("feeds"); - - feedList = new ArrayList(jfeeds.size()); - - jfeeds.forEach((k, v)-> { - JsonObject jf; - String str; - Category cat; - JsonArray jcategories; - - jf = (JsonObject)v; - jcategories = jf.getJsonArray("categories"); - str = jcategories.getString(0); - - cat = getCategory(str); - - if (cat != null) - feedList.add(new Feed(k, cat)); - else - LOG.severe("Missing category: " + str); - }); - - feeds = feedList.toArray(new Feed[0]); - - loadEntities(jroot); - } - - public boolean isBlacklistedEntity(String e) { - final String METHOD_NAME = "isBlacklistedEntity"; - boolean result; - - LOG.entering(CLASS_NAME, METHOD_NAME, e); - - result = blacklistedEntities.contains(e); - - LOG.exiting(CLASS_NAME, METHOD_NAME, result); - - return result; - } - - public boolean isObsolete(Instant instant) { - Instant olderInstant; - - olderInstant = Instant.now().minus(60, ChronoUnit.DAYS); - - if (instant.isAfter(olderInstant)) - return false; - else - return true; - } - - public Feed[] getFeeds() { - return feeds; - } - - public Map> getFeedsByCategory() { - Map> result; - Feed[] feeds; - List catFeeds; - Category cat; - - result = new HashMap<>(); - - feeds = getFeeds(); - for (Feed f: feeds) { - cat = f.getCategory(); - - catFeeds = result.get(cat); - if (catFeeds == null) { - catFeeds = new ArrayList(); - result.put(cat, catFeeds); - } - catFeeds.add(f); - } - - return result; - } - - public Category[] getCategories() { - return categories; - } - - public Category getDefaultCategory(Language lang) { - return defaultCategories.get(lang.getId()); - } - - public Language[] getLanguages() { - return languages; - } - - public Language getDefaultLanguage() { - return languages[0]; - } - - public static void main(String[] args) throws UnsupportedEncodingException { - Config cfg; - Feed[] feeds; - Category[] cats; - - cfg = new Config(); - cfg.loadConfig(); - - cats = cfg.getCategories(); - for (Category cat: cats) - System.out.println(cat); - - feeds = cfg.getFeeds(); - - System.out.println("Number of feeds: " + feeds.length); - for (Feed f: feeds) - System.out.println(f); - } -} diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java index d20d415..ac7a749 100644 --- a/war/src/main/java/pnews/servlet/HTML.java +++ b/war/src/main/java/pnews/servlet/HTML.java @@ -9,10 +9,11 @@ import java.util.logging.Logger; import com.rometools.rome.io.FeedException; -import pnews.Article; -import pnews.Category; -import pnews.EntityStat; -import pnews.Language; +import net.wpitchoune.pnews.Article; +import net.wpitchoune.pnews.Category; +import net.wpitchoune.pnews.Config; +import net.wpitchoune.pnews.EntityStat; +import net.wpitchoune.pnews.Language; public class HTML { private static final String CLASS_NAME= HTML.class.getName(); @@ -48,9 +49,9 @@ public class HTML { buf.append("
\n"); buf.append("
\n"); - if (a.thumbnail != null) { + if (a.getThumbnail() != null) { buf.append("\n"); } buf.append("
\n"); @@ -58,18 +59,18 @@ public class HTML { buf.append("
\n"); buf.append("
\n"); - appendA(buf, a.title, "/redirect?url=" + URLEncoder.encode(a.link, "UTF-8"), null); + appendA(buf, a.getTitle(), "/redirect?url=" + URLEncoder.encode(a.getLink(), "UTF-8"), null); buf.append("
\n"); - buf.append(""); + buf.append(""); buf.append("
\n"); - if (a.description != null) { + if (a.getDescription() != null) { buf.append("

"); - if (a.description.length() < 512) { - buf.append(a.description); + if (a.getDescription().length() < 512) { + buf.append(a.getDescription()); } else { - buf.append(a.description.substring(0, 512)); + buf.append(a.getDescription().substring(0, 512)); buf.append("[..]"); } buf.append("

"); diff --git a/war/src/main/java/pnews/servlet/JSON.java b/war/src/main/java/pnews/servlet/JSON.java index bf6dcae..78379a4 100644 --- a/war/src/main/java/pnews/servlet/JSON.java +++ b/war/src/main/java/pnews/servlet/JSON.java @@ -9,8 +9,10 @@ import com.google.gson.Gson; import com.google.gson.JsonObject; import com.rometools.rome.io.FeedException; -import pnews.Article; -import pnews.Category; +import net.wpitchoune.pnews.Article; +import net.wpitchoune.pnews.ArticleStore; +import net.wpitchoune.pnews.Category; +import net.wpitchoune.pnews.Config; public class JSON { private static final Logger LOG = Logger.getLogger(JSON.class.getName()); @@ -30,8 +32,8 @@ public class JSON { allArticles = ArticleStore.singleton.getArticles(); for (Article a: allArticles) - if (a.readCount.get() > 0) - jreadcounts.addProperty(a.link, a.readCount); + if (a.getReadCount() > 0) + jreadcounts.addProperty(a.getLink(), a.getReadCount()); jcategories = new JsonObject(); jstats.add("categories", jcategories); diff --git a/war/src/main/java/pnews/servlet/Pnews.java b/war/src/main/java/pnews/servlet/Pnews.java index 69ad1bc..c51f946 100644 --- a/war/src/main/java/pnews/servlet/Pnews.java +++ b/war/src/main/java/pnews/servlet/Pnews.java @@ -20,9 +20,11 @@ import javax.servlet.http.HttpServletResponse; import com.rometools.rome.io.FeedException; -import pnews.Article; -import pnews.Category; -import pnews.Language; +import net.wpitchoune.pnews.Article; +import net.wpitchoune.pnews.ArticleStore; +import net.wpitchoune.pnews.Category; +import net.wpitchoune.pnews.Config; +import net.wpitchoune.pnews.Language; public class Pnews extends HttpServlet { private static final String CLASS_NAME = Pnews.class.getName(); @@ -72,7 +74,7 @@ public class Pnews extends HttpServlet { if (redirectURL != null) { a = ArticleStore.singleton.get(redirectURL); if (a != null) - a.readCount.incrementAndGet(); + a.incrementReadCount(); else LOG.severe("Cannot find the article " + redirectURL); diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 03d4217..b309cd7 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -138,6 +138,7 @@ "https://opensource.com/feed": { "categories": ["en_linux"]}, "https://www.infoworld.com/category/linux/index.rss": { "categories": ["en_linux"]}, "https://www.infoworld.com/category/open-source-tools/index.rss": { "categories": ["en_linux"]}, + "https://www.techrepublic.com/rssfeeds/topic/open-source/": { "categories": ["en_linux"]}, "https://www.space.com/home/feed/site.xml": { "categories": ["en_space"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_space"]}, "http://www.esa.int/rssfeed/Our_Activities/Space_News": { "categories": ["en_space"]}, -- 2.7.4