From 52bcbff162b9fc88606d2e4e5195657f5040b261 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Thu, 2 Nov 2017 17:30:54 +0100 Subject: [PATCH 01/16] do not use the feed image for the article, most are designed to be an icon and scale badly --- war/src/main/java/pnews/servlet/ArticleProvider.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index 5efea15..4e85e83 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -90,11 +90,7 @@ public class ArticleProvider { thumbnail = e.getUrl(); break; } - - if (thumbnail == null && feed.getImage() != null) - thumbnail = feed.getImage().getUrl(); - - + title = entry.getTitle().trim(); if (entry.getDescription() != null) { -- 2.7.4 From 6a601c7b03d36f95fbb69ba487c8a4953828c71d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Thu, 2 Nov 2017 17:42:39 +0100 Subject: [PATCH 02/16] fixed url of a rss layout the topics horizontally --- war/src/main/java/pnews/servlet/HTML.java | 7 ++++--- war/src/main/resources/feeds.json | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java index 89ce7c5..42564e8 100644 --- a/war/src/main/java/pnews/servlet/HTML.java +++ b/war/src/main/java/pnews/servlet/HTML.java @@ -131,18 +131,19 @@ public class HTML { entities = provider.getEntityStats(catActive); if (entities.size() > 0) { - buf.append("Hot topics: "); + buf.append("\n"); } } catch (IllegalArgumentException | FeedException | IOException e2) { LOG.log(Level.SEVERE, "Failed to get entities", e2); diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 0118278..aae327a 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -101,7 +101,7 @@ "https://www.techworld.com/news/rss": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TechCrunch/": { "categories": ["en_technologie"]}, "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]}, - "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]}, + "https://www.digitaltrends.com/computing/feed/": { "categories": ["en_technologie"]}, "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]}, "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, -- 2.7.4 From 511fe72ae8bad61a60f738c01819af58510ae9d3 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Thu, 2 Nov 2017 23:41:22 +0100 Subject: [PATCH 03/16] navigation with hot topics --- war/src/main/java/pnews/Article.java | 8 ++++++++ .../main/java/pnews/servlet/ArticleProvider.java | 18 +++++++++++++----- war/src/main/java/pnews/servlet/HTML.java | 21 ++++++++++++++++----- war/src/main/java/pnews/servlet/JSON.java | 2 +- war/src/main/java/pnews/servlet/Pnews.java | 10 +++++----- war/src/main/resources/feeds.json | 6 +++++- 6 files changed, 48 insertions(+), 17 deletions(-) diff --git a/war/src/main/java/pnews/Article.java b/war/src/main/java/pnews/Article.java index f38f792..97e8b76 100644 --- a/war/src/main/java/pnews/Article.java +++ b/war/src/main/java/pnews/Article.java @@ -27,6 +27,14 @@ public class Article { return entities; } + public boolean hasEntity(String entity) { + for (String e: entities) + if (e.equals(entity)) + return true; + + return false; + } + public Date getPublicationDate() { return publicationDate; } diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index 4e85e83..c2d8f59 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -181,16 +181,24 @@ public class ArticleProvider { /** * Returns a copy. */ - public List
getArticles(Category cat) + public List
getArticles(Category cat, String entity) throws IllegalArgumentException, MalformedURLException, FeedException, IOException { - List
articles; + List
articles, result; synchronized (articlesByCategory) { articles = getArticlesForUpdate(cat); } - synchronized (articles) { - return new ArrayList<>(articles); + synchronized (articles) { + if (entity == null) + return new ArrayList<>(articles); + + result = new ArrayList<>(articles.size()); + for (Article a: articles) + if (a.hasEntity(entity)) + result.add(a); + + return result; } } @@ -203,7 +211,7 @@ public class ArticleProvider { LOG.entering(CLASS_NAME, FUNCTION_NAME, cat); - articles = getArticles(cat); + articles = getArticles(cat, null); entities = new HashMap<>(); for (Article a: articles) diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java index 42564e8..72f0b4b 100644 --- a/war/src/main/java/pnews/servlet/HTML.java +++ b/war/src/main/java/pnews/servlet/HTML.java @@ -105,11 +105,20 @@ public class HTML { buf.append("\n"); } - public static String toHTML(List
articles, Category catActive, Config cfg, ArticleProvider provider) { + private static String toURL(Category catActive, String entity) { + try { + return catActive.getURL() + "?entity=" + URLEncoder.encode(entity, "UTF-8"); + } catch (UnsupportedEncodingException e) { + LOG.log(Level.SEVERE, "Failed to generate link to entity " + entity, e); + return catActive.getURL(); + } + } + + public static String toHTML(List
articles, Category catActive, String entityActive, Config cfg, ArticleProvider provider) { StringBuffer buf; int i; - Category[] cats; List entities; + String cl; buf = new StringBuffer(); buf.append("\n"); @@ -123,8 +132,6 @@ public class HTML { buf.append("\n"); buf.append("\n"); - cats = cfg.getCategories(); - appendMenu(buf, catActive, cfg); try { @@ -136,7 +143,11 @@ public class HTML { i = 0; for (EntityStat s: entities) { buf.append("
  • "); - buf.append(s.getEntity()); + if (entityActive != null && s.getEntity().equals(entityActive)) + cl = "active"; + else + cl = null; + appendA(buf, s.getEntity(), toURL(catActive, s.getEntity()), cl); buf.append("
  • \n"); i++; if (i > 10) diff --git a/war/src/main/java/pnews/servlet/JSON.java b/war/src/main/java/pnews/servlet/JSON.java index 13509ed..bf6dcae 100644 --- a/war/src/main/java/pnews/servlet/JSON.java +++ b/war/src/main/java/pnews/servlet/JSON.java @@ -38,7 +38,7 @@ public class JSON { for (Category cat: config.getCategories()) try { - articles = provider.getArticles(cat); + articles = provider.getArticles(cat, null); jcategories.addProperty(cat.getLabel(), articles.size()); } catch (IllegalArgumentException | FeedException | IOException e) { diff --git a/war/src/main/java/pnews/servlet/Pnews.java b/war/src/main/java/pnews/servlet/Pnews.java index 777f7fd..69ad1bc 100644 --- a/war/src/main/java/pnews/servlet/Pnews.java +++ b/war/src/main/java/pnews/servlet/Pnews.java @@ -105,20 +105,20 @@ public class Pnews extends HttpServlet { } - private void writeArticles(Category cat, HttpServletResponse rp) { + private void writeArticles(Category cat, String entity, HttpServletResponse rp) { String html; List
    articles; try { - articles = provider.getArticles(cat); + articles = provider.getArticles(cat, entity); if (articles != null) { - html = HTML.toHTML(articles, cat, config, provider); + html = HTML.toHTML(articles, cat, entity, config, provider); rp.setContentType("text/html;charset=utf-8"); rp.getWriter().write(html); rp.setCharacterEncoding("utf-8"); } else { LOG.severe("writeArticles cannot retrieve any articles"); - html = HTML.toHTML(new ArrayList
    (), cat, config, provider); + html = HTML.toHTML(new ArrayList
    (), cat, entity, config, provider); rp.setContentType("text/html"); rp.getWriter().write(html); } @@ -186,7 +186,7 @@ public class Pnews extends HttpServlet { for (Category cat: config.getCategories()) { if (path.equals(cat.getURL())) { - writeArticles(cat, resp); + writeArticles(cat, getQueryParameter(req, "entity"), resp); return ; } } diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index aae327a..91e2a1e 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -93,6 +93,9 @@ "http://www.futura-sciences.com/rss/actualites.xml": { "categories": ["technologie"] }, "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] }, "https://korben.info/feed": { "categories": ["technologie"]}, + "https://www.techhive.com/index.rss": { "categories": ["en_technologie"]}, + "https://www.gnome.org/feed/": { "categories": ["en_technologie"]}, + "http://www.markshuttleworth.com/feed": { "categories": ["en_technologie"]}, "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]}, "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]}, "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]}, @@ -106,6 +109,7 @@ "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}, - "https://www.debian.org/News/news": { "categories": ["en_technologie"]} + "https://www.debian.org/News/news": { "categories": ["en_technologie"]}, + "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]} } } -- 2.7.4 From 373bdb548a4a9f7b9c1347888f446d448e65b65e Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Thu, 2 Nov 2017 23:48:02 +0100 Subject: [PATCH 04/16] fixed typo --- war/src/main/resources/feeds.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 91e2a1e..c6a72c6 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -51,8 +51,8 @@ "language": "fr" }, { "id": "en_technologie", - "label": "Technologie", - "title": "Technologie", + "label": "Technology", + "title": "Technology", "language": "en" } ], -- 2.7.4 From e09972205a8f7f9b733a6f23262799d2dac48493 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Fri, 3 Nov 2017 00:01:27 +0100 Subject: [PATCH 05/16] added some feeds --- war/src/main/resources/feeds.json | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index c6a72c6..a0a2f9d 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -110,6 +110,10 @@ "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}, "https://www.debian.org/News/news": { "categories": ["en_technologie"]}, - "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]} + "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]}, + "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, + "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, + "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, + "https://gizmodo.com/rss": { "categories": ["en_technologie"]} } } -- 2.7.4 From cdd191d522dce1245032f0d92b201c3e0d679610 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Fri, 3 Nov 2017 00:04:41 +0100 Subject: [PATCH 06/16] added feeds --- war/src/main/resources/feeds.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index a0a2f9d..90df5cd 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -114,6 +114,7 @@ "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, - "https://gizmodo.com/rss": { "categories": ["en_technologie"]} + "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, + "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]} } } -- 2.7.4 From 7e7f5169cbba419822c4fd7e05a85e81972a9fd6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Fri, 3 Nov 2017 00:16:05 +0100 Subject: [PATCH 07/16] do classification in // --- war/src/main/java/pnews/NER.java | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 3a6fc82..bcb8951 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -12,10 +12,16 @@ import edu.stanford.nlp.util.Triple; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { private static final String CLASS_NAME = NER.class.getName(); - private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static final ThreadLocal> classifier = new ThreadLocal>() { + @Override + protected CRFClassifier initialValue() { + return CRFClassifier.getDefaultClassifier(); + } + }; public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { - final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); + List> triples; String w; final String FUNCTION_NAME = "classify"; @@ -25,7 +31,7 @@ public class NER { OpenNLP.classify(str, entities); synchronized (classifier) { - triples = classifier.classifyToCharacterOffsets(str); + triples = classifier.get().classifyToCharacterOffsets(str); for (Triple t: triples) { w = str.substring(t.second, t.third); if (!entities.contains(w)) -- 2.7.4 From 63c2717409a3235573418e6bc0d9bd0fae8356e4 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Fri, 3 Nov 2017 01:07:42 +0100 Subject: [PATCH 08/16] limit memory by using only one classifier instance added feeds --- war/src/main/java/pnews/NER.java | 23 ++++++++++------------- war/src/main/resources/feeds.json | 8 +++++++- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index bcb8951..2745868 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -6,19 +6,14 @@ import java.util.List; import java.util.logging.Logger; import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Triple; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { private static final String CLASS_NAME = NER.class.getName(); private static final Logger LOG = Logger.getLogger(CLASS_NAME); - private static final ThreadLocal> classifier = new ThreadLocal>() { - @Override - protected CRFClassifier initialValue() { - return CRFClassifier.getDefaultClassifier(); - } - }; + private static final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { @@ -31,18 +26,20 @@ public class NER { OpenNLP.classify(str, entities); synchronized (classifier) { - triples = classifier.get().classifyToCharacterOffsets(str); - for (Triple t: triples) { - w = str.substring(t.second, t.third); - if (!entities.contains(w)) - entities.add(w); - } + triples = classifier.classifyToCharacterOffsets(str); + } + + for (Triple t: triples) { + w = str.substring(t.second, t.third); + if (!entities.contains(w)) + entities.add(w); } entities.remove("CNET"); entities.remove("Read More"); entities.remove("New"); entities.remove("App"); + entities.remove("Digital Trends"); LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 90df5cd..dd2b0fd 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -54,6 +54,11 @@ "label": "Technology", "title": "Technology", "language": "en" + }, { + "id": "en_linux", + "label": "Linux", + "title": "Linux", + "language": "en" } ], "feeds": { @@ -115,6 +120,7 @@ "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, - "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]} + "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}, + "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]} } } -- 2.7.4 From 15e8383fbba3b69defce638682df6e59538ff4b3 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sat, 4 Nov 2017 22:17:48 +0100 Subject: [PATCH 09/16] english by default --- war/src/main/resources/feeds.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index dd2b0fd..ecb8b1a 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -1,7 +1,7 @@ { "languages": [ - {"id": "fr"}, - {"id": "en"} + {"id": "en"}, + {"id": "fr"} ], "categories": [ { -- 2.7.4 From 6d94fd5a39e6f78a68201230bd57b9ceb95e125d Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sat, 4 Nov 2017 23:29:11 +0100 Subject: [PATCH 10/16] blacklisted entities are now in the configuration file --- war/src/main/java/pnews/NER.java | 15 +++----- war/src/main/java/pnews/OpenNLP.java | 15 ++++---- .../main/java/pnews/servlet/ArticleProvider.java | 6 ++-- war/src/main/java/pnews/servlet/Config.java | 42 +++++++++++++++++++++- war/src/main/resources/feeds.json | 23 +++++++++--- 5 files changed, 75 insertions(+), 26 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 2745868..2055cf1 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -8,6 +8,7 @@ import java.util.logging.Logger; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Triple; +import pnews.servlet.Config; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { @@ -15,7 +16,7 @@ public class NER { private static final Logger LOG = Logger.getLogger(CLASS_NAME); private static final CRFClassifier classifier = CRFClassifier.getDefaultClassifier(); - public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { + public static List classify(String str, List entities, Config config) throws ClassCastException, ClassNotFoundException, IOException { List> triples; String w; @@ -23,7 +24,7 @@ public class NER { LOG.entering(CLASS_NAME, FUNCTION_NAME, str); - OpenNLP.classify(str, entities); + OpenNLP.classify(str, entities, config); synchronized (classifier) { triples = classifier.classifyToCharacterOffsets(str); @@ -31,16 +32,10 @@ public class NER { for (Triple t: triples) { w = str.substring(t.second, t.third); - if (!entities.contains(w)) + if (!config.isBlacklistedEntity(w) && !entities.contains(w)) entities.add(w); } - entities.remove("CNET"); - entities.remove("Read More"); - entities.remove("New"); - entities.remove("App"); - entities.remove("Digital Trends"); - LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); return entities; @@ -49,7 +44,7 @@ public class NER { public static void main(String[] args) throws Exception { List lst; - lst = classify("I live in Washington and New York in United States.", new ArrayList<>()); + lst = classify("I live in Washington and New York in United States.", new ArrayList<>(), new Config()); for (String str: lst) System.out.println(str); } diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java index 07fbba5..c383cee 100644 --- a/war/src/main/java/pnews/OpenNLP.java +++ b/war/src/main/java/pnews/OpenNLP.java @@ -10,6 +10,7 @@ import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; +import pnews.servlet.Config; /** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ public class OpenNLP { @@ -21,13 +22,13 @@ public class OpenNLP { private static TokenNameFinderModel timeModel; private static TokenizerModel tokenModel; - public static List classify(String str, List entities) throws IOException { - classify(str, getOrganizationModel(), entities); + public static List classify(String str, List entities, Config config) throws IOException { + classify(str, getOrganizationModel(), entities, config); - classify(str, getPersonModel(), entities); - classify(str, getLocationModel(), entities); + classify(str, getPersonModel(), entities, config); + classify(str, getLocationModel(), entities, config); - classify(str, getTimeModel(), entities); + classify(str, getTimeModel(), entities, config); return entities; } @@ -76,7 +77,7 @@ public class OpenNLP { return timeModel; } - private static List classify(String str, TokenNameFinderModel model, List entities) throws IOException { + private static List classify(String str, TokenNameFinderModel model, List entities, Config config) throws IOException { String entity; NameFinderME nameFinder = new NameFinderME(model); @@ -95,7 +96,7 @@ public class OpenNLP { entity += " " + tokens[i]; LOG.finest(entity + " " + s.getProb() + " " + s.toString()); - if (!entities.contains(entity)) + if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) entities.add(entity); } diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index c2d8f59..ec74123 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -74,7 +74,7 @@ public class ArticleProvider { return false; } - private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { + private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { String desc, title, thumbnail, feedTitle, str; Date date; List entities; @@ -111,8 +111,8 @@ public class ArticleProvider { entities = new ArrayList<>(); if (desc != null && lang.equals("en")) try { - NER.classify(title, entities); - NER.classify(desc, entities); + NER.classify(title, entities, config); + NER.classify(desc, entities, config); } catch (ClassCastException | ClassNotFoundException | IOException e1) { LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1); } diff --git a/war/src/main/java/pnews/servlet/Config.java b/war/src/main/java/pnews/servlet/Config.java index bafb606..fec3770 100644 --- a/war/src/main/java/pnews/servlet/Config.java +++ b/war/src/main/java/pnews/servlet/Config.java @@ -6,13 +6,17 @@ import java.io.Reader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.logging.Level; import java.util.logging.Logger; import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonObject; +import javax.json.JsonString; import javax.json.JsonValue; import pnews.Category; @@ -23,11 +27,14 @@ public class Config { private Feed[] feeds; private Category[] categories; private Language[] languages; + private final Set blacklistedEntities = new HashSet<>(); + private static final String CLASS_NAME = Config.class.getName(); + /** * The key is the language, the value is the default category for this language. */ private Map defaultCategories = new HashMap<>(); - private static final Logger LOG = Logger.getLogger(Config.class.getName()); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); private void loadCategories(JsonArray jcats) { List cats; @@ -75,6 +82,24 @@ public class Config { return null; } + private void loadEntities(JsonObject jroot) { + JsonObject jentities; + JsonArray jblacklist; + final String METHOD_NAME = "loadEntities"; + + jentities = jroot.getJsonObject("entities"); + jblacklist = jentities.getJsonArray("blacklist"); + + jblacklist.forEach((jv)-> { + JsonString js; + + js = (JsonString)jv; + blacklistedEntities.add(js.getString()); + }); + + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); + } + public void loadConfig() throws UnsupportedEncodingException { Reader r; JsonObject jfeeds, jroot; @@ -116,6 +141,21 @@ public class Config { }); feeds = feedList.toArray(new Feed[0]); + + loadEntities(jroot); + } + + public boolean isBlacklistedEntity(String e) { + final String METHOD_NAME = "isBlacklistedEntity"; + boolean result; + + LOG.entering(CLASS_NAME, METHOD_NAME, e); + + result = blacklistedEntities.contains(e); + + LOG.exiting(CLASS_NAME, METHOD_NAME, result); + + return result; } public Feed[] getFeeds() { diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index ecb8b1a..a06ae41 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -91,6 +91,7 @@ "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]}, "http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] }, + "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] }, "http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] }, "http://www.frandroid.com/feed": { "categories": ["technologie"] }, "http://www.silicon.fr/feed": { "categories": ["technologie"] }, @@ -99,9 +100,6 @@ "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] }, "https://korben.info/feed": { "categories": ["technologie"]}, "https://www.techhive.com/index.rss": { "categories": ["en_technologie"]}, - "https://www.gnome.org/feed/": { "categories": ["en_technologie"]}, - "http://www.markshuttleworth.com/feed": { "categories": ["en_technologie"]}, - "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]}, "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]}, "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]}, "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]}, @@ -114,13 +112,28 @@ "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}, - "https://www.debian.org/News/news": { "categories": ["en_technologie"]}, "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]}, "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}, - "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]} + "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}, + "https://www.debian.org/News/news": { "categories": ["en_linux"]}, + "http://www.markshuttleworth.com/feed": { "categories": ["en_linux"]}, + "https://insights.ubuntu.com/feed/": { "categories": ["en_linux"]}, + "http://feeds.feedburner.com/LinuxJournal-BreakingNews?format=xml": { "categories": ["en_linux"]}, + "https://www.gnome.org/feed/": { "categories": ["en_linux"]}, + "http://linuxreviews.org/en.rss": { "categories": ["en_linux"]}, + "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]} + }, + "entities": { + "blacklist": [ + "CNET", + "Read More", + "Digital Trends", + "Joey Sneddon", + "CA" + ] } } -- 2.7.4 From e93f5e690bea424f7b4aec8640c865ca92006db8 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sat, 4 Nov 2017 23:54:52 +0100 Subject: [PATCH 11/16] added feeds --- war/src/main/resources/feeds.json | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index a06ae41..7ec9e49 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -56,8 +56,13 @@ "language": "en" }, { "id": "en_linux", - "label": "Linux", - "title": "Linux", + "label": "Linux & OSS", + "title": "Linux and OpenSource Software", + "language": "en" + }, { + "id": "en_space", + "label": "Space", + "title": "Space", "language": "en" } ], @@ -90,6 +95,7 @@ "http://www.generation-nt.com/export/rss.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]}, + "http://www.esa.int/rssfeed/France": { "categories": ["technologie"]}, "http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] }, "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] }, "http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] }, @@ -110,14 +116,12 @@ "https://www.digitaltrends.com/computing/feed/": { "categories": ["en_technologie"]}, "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]}, "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, - "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}, "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]}, "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, - "https://www.space.com/home/feed/site.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}, "https://www.debian.org/News/news": { "categories": ["en_linux"]}, "http://www.markshuttleworth.com/feed": { "categories": ["en_linux"]}, @@ -125,7 +129,15 @@ "http://feeds.feedburner.com/LinuxJournal-BreakingNews?format=xml": { "categories": ["en_linux"]}, "https://www.gnome.org/feed/": { "categories": ["en_linux"]}, "http://linuxreviews.org/en.rss": { "categories": ["en_linux"]}, - "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]} + "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]}, + "http://lxer.com/module/newswire/headlines.rss": { "categories": ["en_linux"]}, + "https://opensource.com/feed": { "categories": ["en_linux"]}, + "https://www.space.com/home/feed/site.xml": { "categories": ["en_space"]}, + "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_space"]}, + "http://www.esa.int/rssfeed/Our_Activities/Space_News": { "categories": ["en_space"]}, + "https://phys.org/rss-feed/space-news/": { "categories": ["en_space"]}, + "http://spacenews.com/feed/": { "categories": ["en_space"]}, + "https://feeds.feedburner.com/ISF": { "categories": ["en_space"]} }, "entities": { "blacklist": [ -- 2.7.4 From 9ff314621235d6b748abb128edf0331480d0eaaf Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 00:13:41 +0100 Subject: [PATCH 12/16] added support of entity aliases --- war/src/main/java/pnews/NER.java | 2 +- war/src/main/java/pnews/OpenNLP.java | 2 +- war/src/main/java/pnews/servlet/Config.java | 26 ++++++++++++++++++++++++-- war/src/main/resources/feeds.json | 5 ++++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 2055cf1..5e7ce29 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -33,7 +33,7 @@ public class NER { for (Triple t: triples) { w = str.substring(t.second, t.third); if (!config.isBlacklistedEntity(w) && !entities.contains(w)) - entities.add(w); + entities.add(config.getEntityAlias(w)); } LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java index c383cee..e158a00 100644 --- a/war/src/main/java/pnews/OpenNLP.java +++ b/war/src/main/java/pnews/OpenNLP.java @@ -97,7 +97,7 @@ public class OpenNLP { LOG.finest(entity + " " + s.getProb() + " " + s.toString()); if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) - entities.add(entity); + entities.add(config.getEntityAlias(entity)); } return entities; diff --git a/war/src/main/java/pnews/servlet/Config.java b/war/src/main/java/pnews/servlet/Config.java index fec3770..46e75f6 100644 --- a/war/src/main/java/pnews/servlet/Config.java +++ b/war/src/main/java/pnews/servlet/Config.java @@ -28,6 +28,7 @@ public class Config { private Category[] categories; private Language[] languages; private final Set blacklistedEntities = new HashSet<>(); + private final HashMap entityAliases = new HashMap<>(); private static final String CLASS_NAME = Config.class.getName(); /** @@ -83,13 +84,13 @@ public class Config { } private void loadEntities(JsonObject jroot) { - JsonObject jentities; + JsonObject jentities, jaliases; JsonArray jblacklist; final String METHOD_NAME = "loadEntities"; jentities = jroot.getJsonObject("entities"); - jblacklist = jentities.getJsonArray("blacklist"); + jblacklist = jentities.getJsonArray("blacklist"); jblacklist.forEach((jv)-> { JsonString js; @@ -97,7 +98,28 @@ public class Config { blacklistedEntities.add(js.getString()); }); + jaliases = jentities.getJsonObject("aliases"); + jaliases.forEach((k, v)-> { + JsonArray jsources = (JsonArray)v; + + jsources.forEach((jsource)-> { + entityAliases.put(((JsonString)jsource).getString(), k); + }); + }); + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " blacklistedEntities=" + blacklistedEntities); + LOG.logp(Level.FINEST, CLASS_NAME, METHOD_NAME, " entityAliases=" + entityAliases); + } + + public String getEntityAlias(String entity) { + String result; + + result = entityAliases.get(entity); + + if (result == null) + return entity; + else + return result; } public void loadConfig() throws UnsupportedEncodingException { diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 7ec9e49..337e19a 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -146,6 +146,9 @@ "Digital Trends", "Joey Sneddon", "CA" - ] + ], + "aliases": { + "U.S.": ["United States", "US"] + } } } -- 2.7.4 From 975f59a4767a4fc4f298b87abf2a5b415e988726 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 00:22:17 +0100 Subject: [PATCH 13/16] cut very long descriptions --- war/src/main/java/pnews/servlet/HTML.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java index 72f0b4b..d20d415 100644 --- a/war/src/main/java/pnews/servlet/HTML.java +++ b/war/src/main/java/pnews/servlet/HTML.java @@ -66,7 +66,12 @@ public class HTML { buf.append("
    \n"); if (a.description != null) { buf.append("

    "); - buf.append(a.description); + if (a.description.length() < 512) { + buf.append(a.description); + } else { + buf.append(a.description.substring(0, 512)); + buf.append("[..]"); + } buf.append("

    "); } buf.append("
    \n"); -- 2.7.4 From dca870436a8fbd07219ee7bb6526ead617d6768a Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 00:58:26 +0100 Subject: [PATCH 14/16] added feeds and aliases --- war/src/main/resources/feeds.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 337e19a..0078ea8 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -99,6 +99,7 @@ "http://feeds.feedburner.com/lesnumeriques/news": { "categories": ["technologie"] }, "https://www.nextinpact.com/rss/acces-libre.xml": { "categories": ["technologie"] }, "http://www.zdnet.fr/feeds/rss/actualites/": { "categories": ["technologie"] }, + "http://www.tomshardware.fr/feeds/rss2/all.xml": { "categories": ["technologie"] }, "http://www.frandroid.com/feed": { "categories": ["technologie"] }, "http://www.silicon.fr/feed": { "categories": ["technologie"] }, "http://www.fredzone.org/feed": { "categories": ["technologie"] }, @@ -120,6 +121,7 @@ "https://www.theverge.com/rss/index.xml": { "categories": ["en_technologie"]}, "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, + "http://www.tomshardware.com/feeds/rss2/all.xml": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}, @@ -148,7 +150,8 @@ "CA" ], "aliases": { - "U.S.": ["United States", "US"] + "U.S.": ["United States", "US"], + "U.K.": ["UK", "United Kingdom"] } } } -- 2.7.4 From 826fb4bca1e6056890a702dd52abc22b3fb93101 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 01:18:05 +0100 Subject: [PATCH 15/16] added feeds --- war/src/main/resources/feeds.json | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 0078ea8..06ce1fe 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -122,6 +122,7 @@ "https://www.engadget.com/rss.xml": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/TheBoyGeniusReport?format=xml": { "categories": ["en_technologie"]}, "http://www.tomshardware.com/feeds/rss2/all.xml": { "categories": ["en_technologie"]}, + "https://lifehacker.com/rss": { "categories": ["en_technologie"]}, "https://www.popsci.com/rss-technology.xml?loc=contentwell&lnk=tech&dom=section-1": { "categories": ["en_technologie"]}, "https://gizmodo.com/rss": { "categories": ["en_technologie"]}, "http://feeds.feedburner.com/d0od?format=xml": { "categories": ["en_linux"]}, @@ -134,6 +135,8 @@ "http://www.linux-magazine.com/rss/feed/lmi_news": { "categories": ["en_linux"]}, "http://lxer.com/module/newswire/headlines.rss": { "categories": ["en_linux"]}, "https://opensource.com/feed": { "categories": ["en_linux"]}, + "https://www.infoworld.com/category/linux/index.rss": { "categories": ["en_linux"]}, + "https://www.infoworld.com/category/open-source-tools/index.rss": { "categories": ["en_linux"]}, "https://www.space.com/home/feed/site.xml": { "categories": ["en_space"]}, "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_space"]}, "http://www.esa.int/rssfeed/Our_Activities/Space_News": { "categories": ["en_space"]}, @@ -147,11 +150,13 @@ "Read More", "Digital Trends", "Joey Sneddon", - "CA" + "CA", + "Read" ], "aliases": { "U.S.": ["United States", "US"], - "U.K.": ["UK", "United Kingdom"] + "U.K.": ["UK", "United Kingdom"], + "AWS": ["Amazon Web Service"] } } } -- 2.7.4 From 3b83a1595fc151d27a2074dcc0effa7f120227cd Mon Sep 17 00:00:00 2001 From: Jean-Philippe Orsini Date: Sun, 5 Nov 2017 01:19:54 +0100 Subject: [PATCH 16/16] fixed typo in aliases --- war/src/main/resources/feeds.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 06ce1fe..b10615c 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -156,7 +156,7 @@ "aliases": { "U.S.": ["United States", "US"], "U.K.": ["UK", "United Kingdom"], - "AWS": ["Amazon Web Service"] + "AWS": ["Amazon Web Services"] } } } -- 2.7.4