do not load articles older than 60 days
[pnews.git] / war / src / main / java / pnews / servlet / ArticleProvider.java
1 package pnews.servlet;
2
3 import java.io.IOException;
4 import java.net.MalformedURLException;
5 import java.net.URL;
6 import java.time.Instant;
7 import java.util.ArrayList;
8 import java.util.Collections;
9 import java.util.Comparator;
10 import java.util.Date;
11 import java.util.HashMap;
12 import java.util.List;
13 import java.util.Map;
14 import java.util.concurrent.Executors;
15 import java.util.concurrent.ScheduledExecutorService;
16 import java.util.concurrent.TimeUnit;
17 import java.util.logging.Level;
18 import java.util.logging.Logger;
19
20 import org.jsoup.Jsoup;
21
22 import com.rometools.rome.feed.synd.SyndEnclosure;
23 import com.rometools.rome.feed.synd.SyndEntry;
24 import com.rometools.rome.feed.synd.SyndFeed;
25 import com.rometools.rome.io.FeedException;
26 import com.rometools.rome.io.SyndFeedInput;
27 import com.rometools.rome.io.XmlReader;
28
29 import pnews.Article;
30 import pnews.Category;
31 import pnews.EntityStat;
32 import pnews.Feed;
33 import pnews.NER;
34
35 public class ArticleProvider {
36         private static final String CLASS_NAME = ArticleProvider.class.getName();
37         private static final Logger LOG = Logger.getLogger(CLASS_NAME);
38         private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
39         private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
40         private final Config config;
41         
42         public ArticleProvider(Config config) {
43                 this.config = config;
44                 for (Category cat: config.getCategories())
45                         scheduler.scheduleAtFixedRate(new Refresher(cat), 2, 600, TimeUnit.SECONDS);
46         }
47         
48         private static SyndFeed getSyndFeed(String u) throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
49                 XmlReader r;
50                 
51                 r = new XmlReader(new URL(u));
52                 
53                 return new SyndFeedInput().build(r);                
54         }
55         
56         private List<Article> getArticlesForUpdate(Category cat) {
57                 List<Article> result;
58                 
59                 synchronized (articlesByCategory) {
60                         result = articlesByCategory.get(cat);
61                         if (result == null) {
62                                 result = new ArrayList<>();
63                                 articlesByCategory.put(cat, result);
64                         }
65                         return result;
66                 }                
67         }
68         
69         private boolean exists(String articleLink, List<Article> articles) {
70                 synchronized (articles) {
71                         for (Article a: articles)
72                                 if (a.link.equals(articleLink))
73                                         return true;
74                 }
75                 return false;
76         }
77         
78         private Instant getArticleInstant(SyndEntry entry) {
79                 Date date;
80                 
81                 date = entry.getUpdatedDate();       
82                 if (date == null)
83                         date = entry.getPublishedDate();
84
85                 if (date == null)
86                         return Instant.now();
87                 
88                 return date.toInstant();
89         }
90         
91         private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang, Instant instant) {
92                 String desc, title, thumbnail, feedTitle, str;
93                 Date date;
94                 List<String> entities;
95                 
96                 feedTitle = feed.getTitle();
97                 if (feedTitle != null) {
98                         feedTitle = feedTitle.trim();
99                 }
100                 
101                 thumbnail = null;
102                 for (SyndEnclosure e: entry.getEnclosures()) {
103                         if (e.getType().startsWith("image/"))
104                                 thumbnail = e.getUrl();    
105                         break;
106                 }
107                                 
108                 title = entry.getTitle().trim();
109                 
110                 if (entry.getDescription() != null) {
111                         str = entry.getDescription().getValue();
112                         desc = Jsoup.parse(str).text();
113                 } else {       
114                         desc = null;
115                         LOG.severe("No description for " + feedTitle + " - " + title);
116                 }
117                                 
118                 entities = new ArrayList<>();
119                 if (desc != null && lang.equals("en"))
120                         try {
121                                 NER.classify(title, entities, config);
122                                 NER.classify(desc, entities, config);
123                         } catch (ClassCastException | ClassNotFoundException | IOException e1) {
124                                 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);                         
125                         }
126                 
127                 return new Article(link, title, desc, thumbnail, instant, feedTitle, entities.toArray(new String[0]));
128         }
129         
130         private void addArticles(Category cat, SyndFeed feed) {
131                 String feedTitle;
132                 List<Article> articles;
133                 Article a;
134                 
135                 feedTitle = feed.getTitle().trim();
136                 
137                 LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
138                 
139                 for (SyndEntry entry: feed.getEntries()) {
140                         String link = entry.getLink().trim();
141                         articles = getArticlesForUpdate(cat);
142                         if (exists(link, articles)) {
143                                 LOG.fine("addArticles " + link + " is already present");
144                                 continue ;
145                         }
146                         
147                         final Instant instant = getArticleInstant(entry);
148                         
149                         if (config.isObsolete(instant))
150                                 continue ;
151                         
152                         a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage(), instant));
153                         
154                         synchronized (articles) {
155                                 articles.add(a);
156
157                                 Collections.sort(articles, new Comparator<Article>() {
158                                         @Override
159                                         public int compare(Article o1, Article o2) {
160                                                 if (o1.publicationDate == o2.publicationDate)
161                                                         return 0;
162                                                 if (o1.publicationDate == null)
163                                                         return 1;
164                                                 if (o2.publicationDate == null)
165                                                         return -1;
166                                                 return o2.publicationDate.compareTo(o1.publicationDate);
167                                         }
168                                 });
169                         }
170                 }          
171                 
172                 LOG.info("addArticles done " + cat.getLabel());
173         }
174              
175         private void retrieveArticles(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
176                 List<Feed> feeds;
177                 
178                 feeds = config.getFeedsByCategory().get(cat);
179                 
180                 if (feeds != null)
181                         for (Feed f: feeds)
182                                 try {
183                                         addArticles(cat, getSyndFeed(f.getURL()));
184                                 } catch (Throwable e) {
185                                         LOG.log(Level.SEVERE,
186                                                 "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
187                                                 e);
188                                 }
189                 else
190                         LOG.severe("No feed for category " + cat);
191         }
192         
193         /**
194          * Returns a copy.
195          */
196         public List<Article> getArticles(Category cat, String entity)
197                         throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
198                 List<Article> articles, result;                
199                 
200                 synchronized (articlesByCategory) {
201                         articles = getArticlesForUpdate(cat);
202                 }
203                 
204                 synchronized (articles) {                       
205                         if (entity == null)
206                                 return new ArrayList<>(articles);
207                         
208                         result = new ArrayList<>(articles.size());
209                         for (Article a: articles)
210                                 if (a.hasEntity(entity))
211                                         result.add(a);
212                         
213                         return result;
214                 }
215         }
216         
217         public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
218                 List<Article> articles;
219                 Map<String, EntityStat> entities;
220                 final String FUNCTION_NAME = "getEntities";
221                 EntityStat s;
222                 List<EntityStat> stats;
223                 
224                 LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
225                 
226                 articles = getArticles(cat, null);
227                 
228                 entities = new HashMap<>();
229                 for (Article a: articles) 
230                         if (a.getEntities() != null) {
231                                 for (String e: a.getEntities()) {
232                                         s = entities.get(e);
233                                         if (s == null) {
234                                                 s = new EntityStat(e);
235                                                 entities.put(e,  s);
236                                         }
237                                         s.increment();
238                                 }                
239                         }
240                 
241                 stats = new ArrayList<>(entities.values());
242                 stats.sort(new Comparator<EntityStat>() {
243
244                         @Override
245                         public int compare(EntityStat o1, EntityStat o2) {
246                                 return Integer.compare(o2.getCount(), o1.getCount());
247                         }
248                         
249                 });
250                 
251                 LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
252                 
253                 return stats;
254         }
255         
256         private class Refresher implements Runnable {
257                 private final Category category;
258                 
259                 public Refresher(Category category) {
260                         this.category = category;
261                 }
262                 
263                 @Override
264                 public void run() {                       
265                         LOG.info("refresher "+ category.getLabel());
266                         
267                         try {
268                                 retrieveArticles(category);
269                         } catch (IllegalArgumentException | FeedException | IOException e) {
270                                 LOG.log(Level.SEVERE, "refresher failure", e);
271                         }                        
272                         
273                         LOG.info("refresher "+ category.getLabel() + " done");
274                 }                
275         }
276 }