3 import java.io.IOException;
4 import java.net.MalformedURLException;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.List;
13 import java.util.concurrent.Executors;
14 import java.util.concurrent.ScheduledExecutorService;
15 import java.util.concurrent.TimeUnit;
16 import java.util.logging.Level;
17 import java.util.logging.Logger;
19 import org.jsoup.Jsoup;
21 import com.rometools.rome.feed.synd.SyndEnclosure;
22 import com.rometools.rome.feed.synd.SyndEntry;
23 import com.rometools.rome.feed.synd.SyndFeed;
24 import com.rometools.rome.io.FeedException;
25 import com.rometools.rome.io.SyndFeedInput;
26 import com.rometools.rome.io.XmlReader;
29 import pnews.Category;
30 import pnews.EntityStat;
/**
 * Keeps, for every {@link Category}, a list of {@link Article} objects and
 * owns the scheduler used to run the per-category refresh tasks.
 */
34 public class ArticleProvider {
// Class name, reused for the logger and for the entering/exiting log records.
35 private static final String CLASS_NAME = ArticleProvider.class.getName();
36 private static final Logger LOG = Logger.getLogger(CLASS_NAME);
// Articles grouped by category; accesses visible in this class are made while
// holding this map's monitor.
37 private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
// Thread pool sized to the number of available processors; runs the Refresher tasks.
38 private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
// Source of the categories and of the feeds attached to each category.
39 private final Config config;
/**
 * Schedules one {@link Refresher} per category returned by
 * {@code config.getCategories()}: first run after 2 seconds, then one run
 * every 600 seconds.
 *
 * NOTE(review): the tasks are scheduled from inside the constructor, so a
 * Refresher could in principle run before construction has finished — confirm
 * that every field it relies on is assigned before this loop.
 *
 * @param config configuration providing the categories to refresh
 */
41 public ArticleProvider(Config config) {
43 for (Category cat: config.getCategories())
44 scheduler.scheduleAtFixedRate(new Refresher(cat), 2, 600, TimeUnit.SECONDS);
/**
 * Reads the XML document located at the given URL and builds a
 * {@link SyndFeed} from it with {@link SyndFeedInput}.
 *
 * NOTE(review): closing of the {@code XmlReader} is not visible here — confirm
 * the reader is released once the feed has been built.
 *
 * @param u textual URL of the feed
 * @return the feed built from the document at {@code u}
 * @throws IllegalArgumentException declared by this method
 * @throws FeedException declared by this method
 * @throws MalformedURLException declared by this method
 * @throws IOException declared by this method
 */
47 private static SyndFeed getSyndFeed(String u) throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
50 r = new XmlReader(new URL(u));
52 return new SyndFeedInput().build(r);
/**
 * Looks up the article list registered for {@code cat} in
 * {@code articlesByCategory} while holding that map's monitor. A new
 * {@code ArrayList} is created and stored under {@code cat} — presumably only
 * when no list is registered yet (the guarding condition is not visible here;
 * confirm).
 *
 * @param cat category whose list is wanted
 * @return presumably the list stored in the map for {@code cat} — confirm
 */
55 private List<Article> getArticlesForUpdate(Category cat) {
58 synchronized (articlesByCategory) {
59 result = articlesByCategory.get(cat);
61 result = new ArrayList<>();
62 articlesByCategory.put(cat, result);
/**
 * Scans {@code articles}, while holding the list's monitor, for an article
 * whose {@code link} equals {@code articleLink}.
 *
 * @param articleLink link to search for
 * @param articles list to scan; also used as the lock
 * @return presumably {@code true} when a matching article is found — the
 *         return statements are not visible here
 */
68 private boolean exists(String articleLink, List<Article> articles) {
69 synchronized (articles) {
70 for (Article a: articles)
71 if (a.link.equals(articleLink))
/**
 * Builds an {@link Article} from a feed entry.
 *
 * @param link link of the article, passed through to the Article
 * @param entry feed entry providing title, description, enclosures and dates
 * @param feed feed the entry belongs to; its title is used as the feed title
 * @param lang language code; entity classification is only attempted for "en"
 * @return a new Article made of the link, title, description text, thumbnail,
 *         date, feed title and the classified entities
 */
77 private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
78 String desc, title, thumbnail, feedTitle, str;
80 List<String> entities;
// The feed title may be null; it is trimmed only when present.
82 feedTitle = feed.getTitle();
83 if (feedTitle != null) {
84 feedTitle = feedTitle.trim();
// The thumbnail is the URL of an enclosure whose type starts with "image/".
// NOTE(review): e.getType() is dereferenced without a visible null check — confirm
// enclosures always carry a type.
88 for (SyndEnclosure e: entry.getEnclosures()) {
89 if (e.getType().startsWith("image/"))
90 thumbnail = e.getUrl();
// NOTE(review): entry.getTitle() is dereferenced without a visible null check.
94 title = entry.getTitle().trim();
// The description value is run through Jsoup to keep only its text content.
96 if (entry.getDescription() != null) {
97 str = entry.getDescription().getValue();
98 desc = Jsoup.parse(str).text();
101 LOG.severe("No description for " + feedTitle + " - " + title);
// Publication date first; the updated date is presumably the fallback when the
// former is missing (the condition is not visible here).
104 date = entry.getPublishedDate();
106 date = entry.getUpdatedDate();
108 LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
// Entities are collected from the title and the description, and only when a
// description exists and the language is "en".
111 entities = new ArrayList<>();
112 if (desc != null && lang.equals("en"))
114 NER.classify(title, entities);
115 NER.classify(desc, entities);
116 } catch (ClassCastException | ClassNotFoundException | IOException e1) {
// A classification failure is logged; the article is presumably still built
// with whatever entities were gathered so far.
117 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
120 return new Article(link, title, desc, thumbnail, date, feedTitle, entities.toArray(new String[0]));
/**
 * Processes every entry of {@code feed} for the category {@code cat}: entries
 * whose link is already in the category's list are reported at FINE level, the
 * others are turned into articles through {@code ArticleStore}. The category's
 * list is sorted by publication date, most recent first.
 *
 * @param cat category receiving the articles
 * @param feed feed whose entries are examined
 */
123 private void addArticles(Category cat, SyndFeed feed) {
125 List<Article> articles;
// NOTE(review): feed.getTitle() is dereferenced without a null check here, whereas
// toArticle() guards the same value against null — confirm which is intended.
128 feedTitle = feed.getTitle().trim();
130 LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
132 for (SyndEntry entry: feed.getEntries()) {
133 String link = entry.getLink().trim();
134 articles = getArticlesForUpdate(cat);
135 if (exists(link, articles)) {
136 LOG.fine("addArticles " + link + " is already present");
// The store is given a supplier that builds the article from the entry;
// presumably it is only invoked when the store has no article for this link.
140 a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage()));
// The list is modified while holding its own monitor, the same lock used by
// exists() and getArticles().
142 synchronized (articles) {
145 Collections.sort(articles, new Comparator<Article>() {
147 public int compare(Article o1, Article o2) {
// Same reference (including both null) is handled first, then each null side,
// so compareTo below is only reached with two non-null dates.
148 if (o1.publicationDate == o2.publicationDate)
150 if (o1.publicationDate == null)
152 if (o2.publicationDate == null)
// o2 compared to o1: descending order, most recent date first.
154 return o2.publicationDate.compareTo(o1.publicationDate);
160 LOG.info("addArticles done " + cat.getLabel());
/**
 * Fetches the feeds configured for {@code cat} and adds their articles.
 * A failure on a feed is caught and logged at SEVERE level with the category
 * label and the feed.
 *
 * @param cat category to refresh
 * @throws IllegalArgumentException declared by this method
 * @throws MalformedURLException declared by this method
 * @throws FeedException declared by this method
 * @throws IOException declared by this method
 */
163 private void retrieveArticles(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
166 feeds = config.getFeedsByCategory().get(cat);
171 addArticles(cat, getSyndFeed(f.getURL()));
// Throwable is caught so that any failure on this feed is logged rather than propagated.
172 } catch (Throwable e) {
173 LOG.log(Level.SEVERE,
174 "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
// Presumably reached when no feed list is configured for the category — confirm.
178 LOG.severe("No feed for category " + cat);
/**
 * Returns a copy of the article list of {@code cat}. The list is obtained
 * while holding the monitor of {@code articlesByCategory} and copied while
 * holding the list's own monitor, so callers get a separate list instance.
 *
 * @param cat category whose articles are wanted
 * @return a new list holding the articles of {@code cat}
 * @throws IllegalArgumentException declared by this method
 * @throws MalformedURLException declared by this method
 * @throws FeedException declared by this method
 * @throws IOException declared by this method
 */
184 public List<Article> getArticles(Category cat)
185 throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
186 List<Article> articles;
// getArticlesForUpdate() synchronizes on the same map itself.
188 synchronized (articlesByCategory) {
189 articles = getArticlesForUpdate(cat);
192 synchronized (articles) {
193 return new ArrayList<>(articles);
/**
 * Gathers {@link EntityStat} objects for the entities attached to the articles
 * of {@code cat} and returns them sorted by count, highest first.
 *
 * @param cat category whose articles are analysed
 * @return presumably the sorted statistics, one per distinct entity — the
 *         return statement is not visible here
 * @throws IllegalArgumentException declared by this method
 * @throws MalformedURLException declared by this method
 * @throws FeedException declared by this method
 * @throws IOException declared by this method
 */
197 public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
198 List<Article> articles;
199 Map<String, EntityStat> entities;
// NOTE(review): the name logged by entering/exiting is "getEntities" while the
// method is getEntityStats — confirm whether this is intentional.
200 final String FUNCTION_NAME = "getEntities";
202 List<EntityStat> stats;
204 LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
206 articles = getArticles(cat);
208 entities = new HashMap<>();
// Articles whose entity array is null are skipped.
209 for (Article a: articles)
210 if (a.getEntities() != null) {
211 for (String e: a.getEntities()) {
// Presumably created only when no stat is recorded yet for this entity.
214 s = new EntityStat(e);
221 stats = new ArrayList<>(entities.values());
222 stats.sort(new Comparator<EntityStat>() {
225 public int compare(EntityStat o1, EntityStat o2) {
// o2 compared to o1: descending order of count.
226 return Integer.compare(o2.getCount(), o1.getCount());
231 LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
/**
 * Task refreshing the articles of one category; instances are handed to the
 * scheduler by the enclosing class's constructor.
 */
236 private class Refresher implements Runnable {
// Category this task refreshes.
237 private final Category category;
/** Creates a task bound to {@code category}. */
239 public Refresher(Category category) {
240 this.category = category;
245 LOG.info("refresher "+ category.getLabel());
248 retrieveArticles(category);
// NOTE(review): only these exception types are caught; with scheduleAtFixedRate
// an exception escaping a run suppresses later runs — confirm nothing else can
// propagate from retrieveArticles().
249 } catch (IllegalArgumentException | FeedException | IOException e) {
250 LOG.log(Level.SEVERE, "refresher failure", e);
253 LOG.info("refresher "+ category.getLabel() + " done");