find hot named entities using stanford ner
author Jean-Philippe Orsini <orsinije@fr.ibm.com>
Mon, 30 Oct 2017 22:21:07 +0000 (23:21 +0100)
committer Jean-Philippe Orsini <orsinije@fr.ibm.com>
Mon, 30 Oct 2017 22:21:07 +0000 (23:21 +0100)
war/src/main/java/pnews/Article.java
war/src/main/java/pnews/EntityStat.java [new file with mode: 0644]
war/src/main/java/pnews/NER.java
war/src/main/java/pnews/servlet/ArticleProvider.java
war/src/main/java/pnews/servlet/HTML.java
war/src/main/java/pnews/servlet/Pnews.java
war/src/main/resources/feeds.json

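diff --git a/war/src/main/java/pnews/Article.java b/war/src/main/java/pnews/Article.java
index ad42e7e..70f69ea 100644 (file)
--- a/war/src/main/java/pnews/Article.java
+++ b/war/src/main/java/pnews/Article.java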
@@ -10,14 +10,20 @@ public class Article {
         public final String link;
         public final Date publicationDate;
         public final String website;
+        public final String[] entities;
         public final AtomicLong readCount = new AtomicLong();
         
-        public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website) {
+        public Article(String link, String title, String description, String thumbnail, Date publicationDate, String website, String[] entities) {
                 this.link = link;
                 this.title = title;
                 this.description = description;
                 this.thumbnail = thumbnail;
                 this.publicationDate = publicationDate;
                 this.website = website;
+                this.entities = entities;
+        }
+        
+        public String[] getEntities() {
+                return entities;
         }
 }
diff --git a/war/src/main/java/pnews/EntityStat.java b/war/src/main/java/pnews/EntityStat.java
new file mode 100644 (file)
index 0000000..a2fcb25
--- /dev/null
+++ b/war/src/main/java/pnews/EntityStat.java
@@ -0,0 +1,27 @@
+package pnews;
+
+public class EntityStat {
+        private final String entity;
+        private int count;
+        
+        public EntityStat(String entity) {
+                this.entity = entity;
+        }
+        
+        public void increment() {
+                count++;
+        }
+        
+        public int getCount() {
+                return count;
+        }
+        
+        public String getEntity() {
+                return entity;
+        }
+        
+        @Override
+        public String toString() {
+                return entity + "(" + count + ")";
+        }
+}
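diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java
index 2868239..f8238c1 100644 (file)
--- a/war/src/main/java/pnews/NER.java
+++ b/war/src/main/java/pnews/NER.java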
@@ -1,7 +1,9 @@
 package pnews;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
+import java.util.logging.Logger;
 
 import edu.stanford.nlp.ie.crf.CRFClassifier;
 import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
@@ -9,20 +11,33 @@ import edu.stanford.nlp.ling.CoreLabel;
 
 /** https://stanfordnlp.github.io/CoreNLP/api.html */
 public class NER {
-        public static void classify(String str) throws ClassCastException, ClassNotFoundException, IOException {
+        private static final String CLASS_NAME = NER.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
+        
+        public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException {
                 CRFClassifier<CoreLabel> classifier;
                 List<List<CoreLabel>> out;
                 String cat, w;
+                List<String> entities;
+                final String FUNCTION_NAME = "classify";                
+                
+                LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
                 
                 classifier = CRFClassifier.getDefaultClassifier();
                 out = classifier.classify(str);
                 
+                entities = new ArrayList<>();
                 for (List<CoreLabel> labels: out)
                         for (CoreLabel l: labels) {
                                 cat = l.getString(AnswerAnnotation.class);
                                 w = l.word();
-                                System.out.println(cat + " " + w);
+                                if (!cat.equals("O") && !entities.contains(w))
+                                        entities.add(w);
                         }
+                
+                LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
+                
+                return entities.toArray(new String[0]);
         }
         
         public static void main(String[] args) throws Exception {
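diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java
index 3bef20e..05ddedd 100644 (file)
--- a/war/src/main/java/pnews/servlet/ArticleProvider.java
+++ b/war/src/main/java/pnews/servlet/ArticleProvider.java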
@@ -27,10 +27,13 @@ import com.rometools.rome.io.XmlReader;
 
 import pnews.Article;
 import pnews.Category;
+import pnews.EntityStat;
 import pnews.Feed;
+import pnews.NER;
 
 public class ArticleProvider {
-        private static final Logger LOG = Logger.getLogger(ArticleProvider.class.getName());
+        private static final String CLASS_NAME = ArticleProvider.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME);
         private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
         private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
         private final Config config;
@@ -71,9 +74,10 @@ public class ArticleProvider {
                 return false;
         }
         
-        private static Article toArticle(String link, SyndEntry entry, SyndFeed feed) {
+        private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
                 String desc, title, thumbnail, feedTitle, str;
                 Date date;
+                String[] entities;
                 
                 feedTitle = feed.getTitle();
                 if (feedTitle != null) {
@@ -107,7 +111,16 @@ public class ArticleProvider {
                 if (date == null)
                         LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
                                      
-                return new Article(link, title, desc, thumbnail, date, feedTitle);
+                
+                entities = null;
+                if (desc != null && lang.equals("en"))
+                        try {
+                                entities = NER.classify(desc);
+                        } catch (ClassCastException | ClassNotFoundException | IOException e1) {
+                                LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);                         
+                        }
+                
+                return new Article(link, title, desc, thumbnail, date, feedTitle, entities);
         }
         
         private void addArticles(Category cat, SyndFeed feed) {
@@ -127,7 +140,7 @@ public class ArticleProvider {
                                 continue ;
                         }
                         
-                        a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed));
+                        a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage()));
                         
                         synchronized (articles) {
                                 articles.add(a);
@@ -184,6 +197,44 @@ public class ArticleProvider {
                 }
         }
         
+        public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
+                List<Article> articles;
+                Map<String, EntityStat> entities;
+                final String FUNCTION_NAME = "getEntities";
+                EntityStat s;
+                List<EntityStat> stats;
+                
+                LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
+                
+                articles = getArticles(cat);
+                
+                entities = new HashMap<>();
+                for (Article a: articles) 
+                        if (a.getEntities() != null)
+                                for (String e: a.getEntities()) {
+                                        s = entities.get(e);
+                                        if (s == null) {
+                                                s = new EntityStat(e);
+                                                entities.put(e,  s);
+                                        }
+                                        s.increment();
+                                }                
+                
+                stats = new ArrayList<>(entities.values());
+                stats.sort(new Comparator<EntityStat>() {
+
+                        @Override
+                        public int compare(EntityStat o1, EntityStat o2) {
+                                return Integer.compare(o2.getCount(), o1.getCount());
+                        }
+                        
+                });
+                
+                LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
+                
+                return stats;
+        }
+        
         private class Refresher implements Runnable {
                 private final Category category;
                 
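diff --git a/war/src/main/java/pnews/servlet/HTML.java b/war/src/main/java/pnews/servlet/HTML.java
index a9ad838..89ce7c5 100644 (file)
--- a/war/src/main/java/pnews/servlet/HTML.java
+++ b/war/src/main/java/pnews/servlet/HTML.java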
@@ -1,13 +1,17 @@
 package pnews.servlet;
 
+import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
 import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import com.rometools.rome.io.FeedException;
+
 import pnews.Article;
 import pnews.Category;
+import pnews.EntityStat;
 import pnews.Language;
 
 public class HTML {
@@ -101,10 +105,11 @@ public class HTML {
                buf.append("</nav>\n");
        }
        
-       public static String toHTML(List<Article> articles, Category catActive, Config cfg) {
+       public static String toHTML(List<Article> articles, Category catActive, Config cfg, ArticleProvider provider) {
                StringBuffer buf;
                int i;
                Category[] cats;
+               List<EntityStat> entities;
                
                buf = new StringBuffer();
                buf.append("<!DOCTYPE html>\n");
@@ -122,6 +127,27 @@ public class HTML {
                
                appendMenu(buf, catActive, cfg);
                
+               try {
+                       entities = provider.getEntityStats(catActive);
+
+                       if (entities.size() > 0) {
+                               buf.append("Hot topics: ");
+                               buf.append("<ul>");
+                               i = 0;
+                               for (EntityStat s: entities) {
+                                       buf.append("<li>");
+                                       buf.append(s.getEntity());
+                                       buf.append("</li>");
+                                       i++;
+                                       if (i > 10)
+                                               break;
+                               }                               
+                               buf.append("</ul>");
+                       }
+                } catch (IllegalArgumentException | FeedException | IOException e2) {
+                        LOG.log(Level.SEVERE, "Failed to get entities", e2);
+                }
+               
                i = 0;
                for (Article e: articles) {
                        try {
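diff --git a/war/src/main/java/pnews/servlet/Pnews.java b/war/src/main/java/pnews/servlet/Pnews.java
index 029f3c7..777f7fd 100644 (file)
--- a/war/src/main/java/pnews/servlet/Pnews.java
+++ b/war/src/main/java/pnews/servlet/Pnews.java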
@@ -112,13 +112,13 @@ public class Pnews extends HttpServlet {
                 try {
                         articles = provider.getArticles(cat);
                         if (articles != null) {
-                                html = HTML.toHTML(articles, cat, config);
+                                html = HTML.toHTML(articles, cat, config, provider);
                                 rp.setContentType("text/html;charset=utf-8");
                                 rp.getWriter().write(html);
                                 rp.setCharacterEncoding("utf-8");
                         } else {
                                 LOG.severe("writeArticles cannot retrieve any articles");
-                                html = HTML.toHTML(new ArrayList<Article>(), cat, config);
+                                html = HTML.toHTML(new ArrayList<Article>(), cat, config, provider);
                                 rp.setContentType("text/html");
                                 rp.getWriter().write(html);
                         }
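diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json
index 7e971d0..c0d625f 100644 (file)
--- a/war/src/main/resources/feeds.json
+++ b/war/src/main/resources/feeds.json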
@@ -50,9 +50,9 @@
                         "title": "People",
                         "language": "fr"
                 }, {
-                        "id": "ubuntu",
-                        "label": "Ubuntu",
-                        "title": "Ubuntu",
+                        "id": "en_technologie",
+                        "label": "Technologie",
+                        "title": "Technologie",
                         "language": "en"
                 }
         ],
@@ -81,6 +81,7 @@
                 "http://www.ville-palaiseau.fr/rss/actualites.htm": { "categories": ["essonne"] },
                 "http://www.premiere.fr/rss/actu-live": { "categories": ["people"] },
                 "http://www.purepeople.com/rss/news_t0.xml": { "categories": ["people"] },
+                "http://www.01net.com/rss/info/flux-rss/flux-toutes-les-actualites/": { "categories": ["technologie"] },
                 "http://www.generation-nt.com/export/rss.xml": { "categories": ["technologie"] },
                 "http://www.europe1.fr/var/export/rss/europe1/sciences.xml": { "categories": ["technologie"] },
                 "http://www.europe1.fr/var/export/rss/europe1/technologies.xml": { "categories": ["technologie"]},
                 "http://www.futura-sciences.com/rss/actualites.xml": { "categories": ["technologie"] },
                 "https://www-03.ibm.com/press/fr/fr/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["technologie"] },
                 "https://korben.info/feed": { "categories": ["technologie"]},
-                "https://insights.ubuntu.com/feed/": { "categories": ["ubuntu"]}
+                "https://insights.ubuntu.com/feed/": { "categories": ["en_technologie"]},
+                "https://www-03.ibm.com/press/us/en/rssfeed.wss?keyword=null&maxFeed=&feedType=RSS&topic=all": { "categories": ["en_technologie"]},
+                "https://www.cnet.com/rss/news/": { "categories": ["en_technologie"]},
+                "https://www.pcworld.com/index.rss": { "categories": ["en_technologie"]},
+                "https://www.technologyreview.com/c/computing/rss/": { "categories": ["en_technologie"]},
+                "https://www.techworld.com/news/rss": { "categories": ["en_technologie"]},
+                "http://feeds.feedburner.com/TechCrunch/": { "categories": ["en_technologie"]},
+                "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]},
+                "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]},
+                "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]},
+                "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}
         }
 }