3 import java.io.IOException;
4 import java.net.MalformedURLException;
6 import java.util.ArrayList;
7 import java.util.Collections;
8 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.List;
13 import java.util.concurrent.Executors;
14 import java.util.concurrent.ScheduledExecutorService;
15 import java.util.concurrent.TimeUnit;
16 import java.util.logging.Level;
17 import java.util.logging.Logger;
19 import org.jsoup.Jsoup;
21 import com.rometools.rome.feed.synd.SyndEnclosure;
22 import com.rometools.rome.feed.synd.SyndEntry;
23 import com.rometools.rome.feed.synd.SyndFeed;
24 import com.rometools.rome.io.FeedException;
25 import com.rometools.rome.io.SyndFeedInput;
26 import com.rometools.rome.io.XmlReader;
29 import pnews.Category;
30 import pnews.EntityStat;
/**
 * Keeps, for every category, an in-memory list of articles that is refreshed
 * periodically from the feeds configured for that category, and exposes
 * snapshots of those lists as well as per-category entity statistics.
 */
34 public class ArticleProvider {
// Fully-qualified class name, passed to the logger's entering/exiting trace calls.
35 private static final String CLASS_NAME = ArticleProvider.class.getName();
36 private static final Logger LOG = Logger.getLogger(CLASS_NAME);
// Article list per category. The map is accessed while holding its own monitor;
// each list is used as the lock when that list is read or modified.
37 private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
// Scheduler (pool sized to the number of available processors) that runs the
// periodic refresh task of each category.
38 private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
// Supplies the categories to refresh and the feeds registered for each category.
39 private final Config config;
41 public ArticleProvider(Config config) {
43 for (Category cat: config.getCategories())
44 scheduler.scheduleAtFixedRate(new Refresher(cat), 2, 600, TimeUnit.SECONDS);
47 private static SyndFeed getSyndFeed(String u) throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
50 r = new XmlReader(new URL(u));
52 return new SyndFeedInput().build(r);
55 private List<Article> getArticlesForUpdate(Category cat) {
58 synchronized (articlesByCategory) {
59 result = articlesByCategory.get(cat);
61 result = new ArrayList<>();
62 articlesByCategory.put(cat, result);
68 private boolean exists(String articleLink, List<Article> articles) {
69 synchronized (articles) {
70 for (Article a: articles)
71 if (a.link.equals(articleLink))
/**
 * Builds an {@code Article} from a feed entry.
 *
 * @param link link of the article, passed unchanged to the {@code Article} constructor
 * @param entry feed entry providing enclosures, title, description and dates
 * @param feed feed the entry belongs to; provides the feed title and the fallback image
 * @param lang language of the category; entity classification is only attempted for "en"
 * @return the new article
 */
77 private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
78 String desc, title, thumbnail, feedTitle, str;
80 List<String> entities;
// The feed title may be absent; it is trimmed only when present.
82 feedTitle = feed.getTitle();
83 if (feedTitle != null) {
84 feedTitle = feedTitle.trim();
// Thumbnail: URL of an enclosure whose MIME type starts with "image/".
// NOTE(review): e.getType() is dereferenced without a null check - confirm that
// enclosures always carry a type, otherwise this throws NullPointerException.
88 for (SyndEnclosure e: entry.getEnclosures()) {
89 if (e.getType().startsWith("image/"))
90 thumbnail = e.getUrl();
// No image enclosure: fall back to the feed's own image when it has one.
94 if (thumbnail == null && feed.getImage() != null)
95 thumbnail = feed.getImage().getUrl();
// NOTE(review): unlike the feed title above, the entry title is trimmed without a
// null check - an entry without a title would throw NullPointerException here.
98 title = entry.getTitle().trim();
// The description is parsed with Jsoup and reduced to its text content.
100 if (entry.getDescription() != null) {
101 str = entry.getDescription().getValue();
102 desc = Jsoup.parse(str).text();
// Presumably the branch taken when the entry has no description - TODO confirm.
105 LOG.severe("No description for " + feedTitle + " - " + title);
// Date: the published date is read first, then the updated date; presumably the
// second read and the log below only happen when no date was found so far - TODO confirm.
108 date = entry.getPublishedDate();
110 date = entry.getUpdatedDate();
112 LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
// Entity classification of title and description, only for English articles that
// have a description. Presumably NER.classify appends what it finds to the list
// it is given - verify against NER. Classification failures are logged.
115 entities = new ArrayList<>();
116 if (desc != null && lang.equals("en"))
118 NER.classify(title, entities);
119 NER.classify(desc, entities);
120 } catch (ClassCastException | ClassNotFoundException | IOException e1) {
121 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
124 return new Article(link, title, desc, thumbnail, date, feedTitle, entities.toArray(new String[0]));
/**
 * Adds the entries of a feed to the article list of a category, ignoring
 * entries whose link is already in the list, and keeps the list sorted by
 * publication date.
 *
 * @param cat category the feed belongs to
 * @param feed parsed feed whose entries are added
 */
127 private void addArticles(Category cat, SyndFeed feed) {
129 List<Article> articles;
// NOTE(review): the feed title is trimmed without a null check, although toArticle
// treats a null feed title as possible - a feed without a title would throw here.
132 feedTitle = feed.getTitle().trim();
134 LOG.info("addArticles " + cat.getLabel() + " " + feedTitle + " number of articles: " + feed.getEntries().size());
136 for (SyndEntry entry: feed.getEntries()) {
// NOTE(review): entry.getLink() is dereferenced without a null check.
137 String link = entry.getLink().trim();
138 articles = getArticlesForUpdate(cat);
// Entries already in the category list are reported at FINE level.
139 if (exists(link, articles)) {
140 LOG.fine("addArticles " + link + " is already present");
// The article is requested from the store by link; the lambda builds it from the
// entry. Presumably the store only invokes it when it has no article for that
// link - verify against ArticleStore.
144 a = ArticleStore.singleton.getArticle(link, ()->toArticle(link, entry, feed, cat.getLanguage()));
// The list is modified and sorted under its own lock.
// NOTE(review): the sort sits inside the entry loop, so the whole list is sorted
// again for every added article; sorting once after the loop would do less work.
146 synchronized (articles) {
149 Collections.sort(articles, new Comparator<Article>() {
// Orders articles by publication date, most recent first (o2 is compared to o1),
// with dedicated branches for identical or missing dates.
151 public int compare(Article o1, Article o2) {
152 if (o1.publicationDate == o2.publicationDate)
154 if (o1.publicationDate == null)
156 if (o2.publicationDate == null)
158 return o2.publicationDate.compareTo(o1.publicationDate);
164 LOG.info("addArticles done " + cat.getLabel());
/**
 * Fetches the feeds configured for a category and adds their entries to the
 * category's article list.
 *
 * @param cat category to refresh
 */
167 private void retrieveArticles(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
// Feeds registered for this category in the configuration.
170 feeds = config.getFeedsByCategory().get(cat);
// Each feed is downloaded, parsed and merged into the category list. Any failure
// (Throwable) is caught and logged together with the category label and the feed.
175 addArticles(cat, getSyndFeed(f.getURL()));
176 } catch (Throwable e) {
177 LOG.log(Level.SEVERE,
178 "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
// Presumably reached when the configuration has no feed for the category - TODO confirm.
182 LOG.severe("No feed for category " + cat);
188 public List<Article> getArticles(Category cat)
189 throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
190 List<Article> articles;
192 synchronized (articlesByCategory) {
193 articles = getArticlesForUpdate(cat);
196 synchronized (articles) {
197 return new ArrayList<>(articles);
/**
 * Computes statistics about the entities attached to the articles of a
 * category, sorted by decreasing count.
 *
 * @param cat category whose articles are analysed
 * @return one {@code EntityStat} per entity, highest count first
 */
201 public List<EntityStat> getEntityStats(Category cat) throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
202 List<Article> articles;
203 Map<String, EntityStat> entities;
// NOTE(review): the name used in the entering/exiting trace records is
// "getEntities", which does not match this method's name (getEntityStats).
204 final String FUNCTION_NAME = "getEntities";
206 List<EntityStat> stats;
208 LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);
// Works on the copy returned by getArticles.
210 articles = getArticles(cat);
// One EntityStat per entity string; articles without entities are skipped.
212 entities = new HashMap<>();
213 for (Article a: articles)
214 if (a.getEntities() != null) {
215 for (String e: a.getEntities()) {
// Presumably only executed the first time an entity is met - TODO confirm.
218 s = new EntityStat(e);
225 stats = new ArrayList<>(entities.values());
// Descending order of count (o2 is compared to o1).
226 stats.sort(new Comparator<EntityStat>() {
229 public int compare(EntityStat o1, EntityStat o2) {
230 return Integer.compare(o2.getCount(), o1.getCount());
235 LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);
240 private class Refresher implements Runnable {
241 private final Category category;
243 public Refresher(Category category) {
244 this.category = category;
249 LOG.info("refresher "+ category.getLabel());
252 retrieveArticles(category);
253 } catch (IllegalArgumentException | FeedException | IOException e) {
254 LOG.log(Level.SEVERE, "refresher failure", e);
257 LOG.info("refresher "+ category.getLabel() + " done");