From: Jean-Philippe Orsini Date: Mon, 30 Oct 2017 22:21:07 +0000 (+0100) Subject: find hot named entities using stanford ner X-Git-Url: https://git.wpitchoune.net/gitweb/?p=pnews.git;a=commitdiff_plain;h=56c07f5de3319eb61182b7100855801644538e6f find hot named entities using stanford ner --- diff --git a/war/src/main/java/pnews/Article.java b/war/src/main/java/pnews/Article.java index ad42e7e..70f69ea 100644 --- a/war/src/main/java/pnews/Article.java +++ b/war/src/main/java/pnews/Article.java @@ -10,14 +10,20 @@ public class Article { public final String link; public final Date publicationDate; public final String website; + public final String[] entities; public final AtomicLong readCount = new AtomicLong(); - public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website) { + public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website, String[] entities) { this.link = link; this.title = title; this.description = description; this.thumbnail = thumbnail; this.publicationDate = publicationDate; this.website = website; + this.entities = entities; + } + + public String[] getEntities() { + return entities; } } diff --git a/war/src/main/java/pnews/EntityStat.java b/war/src/main/java/pnews/EntityStat.java new file mode 100644 index 0000000..a2fcb25 --- /dev/null +++ b/war/src/main/java/pnews/EntityStat.java @@ -0,0 +1,27 @@ +package pnews; + +public class EntityStat { + private final String entity; + private int count; + + public EntityStat(String entity) { + this.entity = entity; + } + + public void increment() { + count++; + } + + public int getCount() { + return count; + } + + public String getEntity() { + return entity; + } + + @Override + public String toString() { + return entity + "(" + count + ")"; + } +} diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index 2868239..f8238c1 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -1,7 +1,9 @@ package pnews; import java.io.IOException; +import java.util.ArrayList; import java.util.List; +import java.util.logging.Logger; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; @@ -9,20 +11,33 @@ import edu.stanford.nlp.ling.CoreLabel; /** https://stanfordnlp.github.io/CoreNLP/api.html */ public class NER { - public static void classify(String str) throws ClassCastException, ClassNotFoundException, IOException { + private static final String CLASS_NAME = NER.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + + public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException { CRFClassifier classifier; List> out; String cat, w; + List entities; + final String FUNCTION_NAME = "classify"; + + LOG.entering(CLASS_NAME, FUNCTION_NAME, str); classifier = CRFClassifier.getDefaultClassifier(); out = classifier.classify(str); + entities = new ArrayList<>(); for (List labels: out) for (CoreLabel l: labels) { cat = l.getString(AnswerAnnotation.class); w = l.word(); - System.out.println(cat + " " + w); + if (!cat.equals("O") && !entities.contains(w)) + entities.add(w); } + + LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); + + return entities.toArray(new String[0]); } public static void main(String[] args) throws Exception { diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index 3bef20e..05ddedd 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -27,10 +27,13 @@ import com.rometools.rome.io.XmlReader; import pnews.Article; import pnews.Category; +import pnews.EntityStat; import pnews.Feed; +import pnews.NER; public class ArticleProvider { - private static final Logger LOG = Logger.getLogger(ArticleProvider.class.getName()); + private static final String CLASS_NAME = ArticleProvider.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); private final Map> articlesByCategory = new HashMap<>(); private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2); private final Config config; @@ -71,9 +74,10 @@ public class ArticleProvider { return false; } - private static Article toArticle(String link, SyndEntry entry, SyndFeed feed) { + private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { String desc, title, thumbnail, feedTitle, str; Date date; + String[] entities; feedTitle = feed.getTitle(); if (feedTitle != null) { @@ -107,7 +111,16 @@ public class ArticleProvider { if (date == null) LOG.severe("The article " + feedTitle + " - " + title + " does not have a date"); - return new Article(link, title, desc, thumbnail, date, feedTitle); + + entities = null; + if (desc != null && lang.equals("en")) + try { + entities = NER.classify(desc); + } catch (ClassCastException | ClassNotFoundException | IOException e1) { + LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1); + } + + return new Article(link, title, desc, thumbnail, date, feedTitle, entities); } private void addArticles(Category cat, SyndFeed feed) { @@ -127,7 +140,7 @@ public class ArticleProvider { continue ; } - a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed)); + a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage())); synchronized (articles) { articles.add(a); @@ -184,6 +197,44 @@ public class ArticleProvider { } } + public List getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException { + List
articles; + Map entities; + final String FUNCTION_NAME = "getEntities"; + EntityStat s; + List stats; + + LOG.entering(CLASS_NAME, FUNCTION_NAME, cat); + + articles = getArticles(cat); + + entities = new HashMap<>(); + for (Article a: articles) + if (a.getEntities() != null) + for (String e: a.getEntities()) { + s = entities.get(e); + if (s == null) { + s = new EntityStat(e); + entities.put(e, s); + } + s.increment(); + } + + stats = new ArrayList<>(entities.values()); + stats.sort(new Comparator() { + + @Override + public int compare(EntityStat o1, EntityStat o2) { + return Integer.compare(o2.getCount(), o1.getCount()); + } + + }); + + LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats); + + return stats; + } + private class Refresher implements Runnable { private final Category category; diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java index a9ad838..89ce7c5 100644 --- a/war/src/main/java/pnews/servlet/HTML.java +++ b/war/src/main/java/pnews/servlet/HTML.java @@ -1,13 +1,17 @@ package pnews.servlet; +import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import com.rometools.rome.io.FeedException; + import pnews.Article; import pnews.Category; +import pnews.EntityStat; import pnews.Language; public class HTML { @@ -101,10 +105,11 @@ public class HTML { buf.append("\n"); } - public static String toHTML(List
articles, Category catActive, Config cfg) { + public static String toHTML(List
articles, Category catActive, Config cfg, ArticleProvider provider) { StringBuffer buf; int i; Category[] cats; + List entities; buf = new StringBuffer(); buf.append("\n"); @@ -122,6 +127,27 @@ public class HTML { appendMenu(buf, catActive, cfg); + try { + entities = provider.getEntityStats(catActive); + + if (entities.size() > 0) { + buf.append("Hot topics: "); + buf.append("
    "); + i = 0; + for (EntityStat s: entities) { + buf.append("
  • "); + buf.append(s.getEntity()); + buf.append("
  • "); + i++; + if (i > 10) + break; + } + buf.append("
"); + } + } catch (IllegalArgumentException | FeedException | IOException e2) { + LOG.log(Level.SEVERE, "Failed to get entities", e2); + } + i = 0; for (Article e: articles) { try { diff --git a/war/src/main/java/pnews/servlet/Pnews.java b/war/src/main/java/pnews/servlet/Pnews.java index 029f3c7..777f7fd 100644 --- a/war/src/main/java/pnews/servlet/Pnews.java +++ b/war/src/main/java/pnews/servlet/Pnews.java @@ -112,13 +112,13 @@ public class Pnews extends HttpServlet { try { articles = provider.getArticles(cat); if (articles != null) { - html = HTML.toHTML(articles, cat, config); + html = HTML.toHTML(articles, cat, config, provider); rp.setContentType("text/html;charset=utf-8"); rp.getWriter().write(html); rp.setCharacterEncoding("utf-8"); } else { LOG.severe("writeArticles cannot retrieve any articles"); - html = HTML.toHTML(new ArrayList
(), cat, config); + html = HTML.toHTML(new ArrayList
(), cat, config, provider); rp.setContentType("text/html"); rp.getWriter().write(html); } diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index 7e971d0..c0d625f 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -50,9 +50,9 @@ "title": "People", "language": "fr" }, { - "id": "ubuntu", - "label": "Ubuntu", - "title": "Ubuntu", + "id": "en_technologie", + "label": "Technologie", + "title": "Technologie", "language": "en" } ], @@ -81,6 +81,7 @@ "http://www.ville-palaiseau.fr/rss/actualites.htm": { "categories": ["essonne"] }, "http://www.premiere.fr/rss/actu-live": { "categories": ["people"] }, "http://www.purepeople.com/rss/news_t0.xml": { "categories": ["people"] }, + "http://www.01net.com/rss/info/flux-rss/flux-toutes-les-actualites/": { "categories": ["technologie"] }, "http://www.generation-nt.com/export/rss.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] }, "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]}, @@ -92,6 +93,16 @@ "http://www.futura-sciences.com/rss/actualites.xml": { "categories": ["technologie"] }, "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] }, "https://korben.info/feed": { "categories": ["technologie"]}, - "https://insights.ubuntu.com/feed/": { "categories": ["ubuntu"]} + "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]}, + "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]}, + "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]}, + "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]}, + "https://www.technologyreview.com/c/computing/rss/": { "categories": ["en_technologie"]}, + "https://www.techworld.com/news/rss": { "categories": ["en_technologie"]}, + "http://feeds.feedburner.com/TechCrunch/": { "categories": ["en_technologie"]}, + "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]}, + "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]}, + "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]}, + "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]} } }