Clean up and refactor to move to the net.wpitchoune package
[pnews.git] / war / src / main / java / pnews / servlet / ArticleProvider.java
index 48c551b..55898a4 100644 (file)
@@ -3,6 +3,8 @@ package pnews.servlet;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
@@ -25,17 +27,24 @@ import com.rometools.rome.io.FeedException;
 import com.rometools.rome.io.SyndFeedInput;
 import com.rometools.rome.io.XmlReader;
 
-import pnews.Article;
-import pnews.Category;
+import net.wpitchoune.pnews.Article;
+import net.wpitchoune.pnews.ArticleStore;
+import net.wpitchoune.pnews.Category;
+import net.wpitchoune.pnews.Config;
+import net.wpitchoune.pnews.EntityStat;
+import net.wpitchoune.pnews.Feed;
+import net.wpitchoune.pnews.classifier.NamedEntityRecognizer;
 
 public class ArticleProvider {
-        public final static ArticleProvider singleton = new ArticleProvider();
-        private static final Logger LOG = Logger.getLogger(ArticleProvider.class.getName());
+        private static final String CLASS_NAME = ArticleProvider.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME);
         private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
-        private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
+        private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
+        private final Config config;
         
-        private ArticleProvider() {      
-                for (Category cat:Category.values())
+        public ArticleProvider(Config config) {
+                this.config = config;
+                for (Category cat: config.getCategories())
                         scheduler.scheduleAtFixedRate(new Refresher(cat), 2, 600, TimeUnit.SECONDS);
         }
         
@@ -47,8 +56,6 @@ public class ArticleProvider {
                 return new SyndFeedInput().build(r);                
         }
         
-        
-        
         private List<Article> getArticlesForUpdate(Category cat) {
                 List<Article> result;
                 
@@ -65,16 +72,29 @@ public class ArticleProvider {
         private boolean exists(String articleLink, List<Article> articles) {
                 synchronized (articles) {
                         for (Article a: articles)
-                                if (a.link.equals(articleLink))
+                                if (a.getLink().equals(articleLink))
                                         return true;
                 }
                 return false;
         }
         
-        private static Article toArticle(String link, SyndEntry entry, SyndFeed feed) {
-                String desc, title, thumbnail, feedTitle, str;
+        private Instant getArticleInstant(SyndEntry entry) {
                 Date date;
                 
+                date = entry.getUpdatedDate();       
+                if (date == null)
+                        date = entry.getPublishedDate();
+
+                if (date == null)
+                        return Instant.now();
+                
+                return date.toInstant();
+        }
+        
+        private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang, Instant instant) {
+                String desc, title, thumbnail, feedTitle, str;
+                List<String> entities;
+                
                 feedTitle = feed.getTitle();
                 if (feedTitle != null) {
                         feedTitle = feedTitle.trim();
@@ -86,11 +106,7 @@ public class ArticleProvider {
                                 thumbnail = e.getUrl();    
                         break;
                 }
-                
-                if (thumbnail == null && feed.getImage() != null)
-                        thumbnail = feed.getImage().getUrl();
-                             
-                
+                                
                 title = entry.getTitle().trim();
                 
                 if (entry.getDescription() != null) {
@@ -100,14 +116,18 @@ public class ArticleProvider {
                         desc = null;
                         LOG.severe("No description for " + feedTitle + " - " + title);
                 }
+                                
+                entities = new ArrayList<>();
+                if (lang.equals("en"))
+                        try {
+                                NamedEntityRecognizer.classify(title, entities, config);
+                                if (desc != null)
+                                        NamedEntityRecognizer.classify(desc, entities, config);
+                        } catch (ClassCastException | ClassNotFoundException | IOException e1) {
+                                LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);                         
+                        }
                 
-                date = entry.getPublishedDate();
-                if (date == null)
-                        date = entry.getUpdatedDate();
-                if (date == null)
-                        LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
-                                     
-                return new Article(link, title, desc, thumbnail, date, feedTitle);
+                return new Article(link, title, desc, thumbnail, instant, feedTitle, entities.toArray(new String[0]));
         }
         
         private void addArticles(Category cat, SyndFeed feed) {
@@ -117,7 +137,7 @@ public class ArticleProvider {
                 
                 feedTitle = feed.getTitle().trim();
                 
-                LOG.info("addArticles " + cat.getId() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
+                LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
                 
                 for (SyndEntry entry: feed.getEntries()) {
                         String link = entry.getLink().trim();
@@ -127,7 +147,12 @@ public class ArticleProvider {
                                 continue ;
                         }
                         
-                        a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed));
+                        final Instant instant = getArticleInstant(entry);
+                        
+                        if (config.isObsolete(instant))
+                                continue ;
+                        
+                        a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage(), instant));
                         
                         synchronized (articles) {
                                 articles.add(a);
@@ -135,33 +160,33 @@ public class ArticleProvider {
                                 Collections.sort(articles, new Comparator<Article>() {
                                         @Override
                                         public int compare(Article o1, Article o2) {
-                                                if (o1.publicationDate == o2.publicationDate)
+                                                if (o1.getPublicationDate() == o2.getPublicationDate())
                                                         return 0;
-                                                if (o1.publicationDate == null)
+                                                if (o1.getPublicationDate() == null)
                                                         return 1;
-                                                if (o2.publicationDate == null)
+                                                if (o2.getPublicationDate() == null)
                                                         return -1;
-                                                return o2.publicationDate.compareTo(o1.publicationDate);
+                                                return o2.getPublicationDate().compareTo(o1.getPublicationDate());
                                         }
                                 });
                         }
                 }          
                 
-                LOG.info("addArticles done " + cat.getId());
+                LOG.info("addArticles done " + cat.getLabel());
         }
              
         private void retrieveArticles(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
-                String[] feeds;
+                List<Feed> feeds;
                 
-                feeds = Config.getFeedsByCategory().get(cat);
+                feeds = config.getFeedsByCategory().get(cat);
                 
                 if (feeds != null)
-                        for (String str: feeds)
+                        for (Feed f: feeds)
                                 try {
-                                        addArticles(cat, getSyndFeed(str));
+                                        addArticles(cat, getSyndFeed(f.getURL()));
                                 } catch (Throwable e) {
                                         LOG.log(Level.SEVERE,
-                                                "retrieveArticles failure " + cat.getId() + " " + str,
+                                                "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
                                                 e);
                                 }
                 else
@@ -171,19 +196,68 @@ public class ArticleProvider {
         /**
          * Returns a copy.
          */
-        public List<Article> getArticles(Category cat)
+        public List<Article> getArticles(Category cat, String entity)
                         throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
-                List<Article> articles;
+                List<Article> articles, result;                
                 
                 synchronized (articlesByCategory) {
                         articles = getArticlesForUpdate(cat);
                 }
                 
-                synchronized (articles) {
-                        return new ArrayList<>(articles);
+                synchronized (articles) {                       
+                        if (entity == null)
+                                return new ArrayList<>(articles);
+                        
+                        result = new ArrayList<>(articles.size());
+                        for (Article a: articles)
+                                if (a.hasEntity(entity))
+                                        result.add(a);
+                        
+                        return result;
                 }
         }
         
+        public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
+                List<Article> articles;
+                Map<String, EntityStat> entities;
+                final String FUNCTION_NAME = "getEntities";
+                EntityStat s;
+                List<EntityStat> stats;
+                Instant minInstant;
+                
+                LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
+                
+                articles = getArticles(cat, null);
+                
+                minInstant = Instant.now().minus(15, ChronoUnit.DAYS);
+                
+                entities = new HashMap<>();
+                for (Article a: articles)
+                        if (a.getPublicationDate().isAfter(minInstant) && a.getEntities() != null)
+                                for (String e: a.getEntities()) {
+                                        s = entities.get(e);
+                                        if (s == null) {
+                                                s = new EntityStat(e);
+                                                entities.put(e,  s);
+                                        }
+                                        s.increment();
+                                }                
+               
+                stats = new ArrayList<>(entities.values());
+                stats.sort(new Comparator<EntityStat>() {
+
+                        @Override
+                        public int compare(EntityStat o1, EntityStat o2) {
+                                return Integer.compare(o2.getCount(), o1.getCount());
+                        }
+                        
+                });
+                
+                LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
+                
+                return stats;
+        }
+        
         private class Refresher implements Runnable {
                 private final Category category;
                 
@@ -193,7 +267,7 @@ public class ArticleProvider {
                 
                 @Override
                 public void run() {                       
-                        LOG.info("refresher "+ category.getId());
+                        LOG.info("refresher "+ category.getLabel());
                         
                         try {
                                 retrieveArticles(category);
@@ -201,7 +275,7 @@ public class ArticleProvider {
                                 LOG.log(Level.SEVERE, "refresher failure", e);
                         }                        
                         
-                        LOG.info("refresher "+ category.getId() + " done");
+                        LOG.info("refresher "+ category.getLabel() + " done");
                 }                
         }
 }