// Origin: pnews.git, war/src/main/java/net/wpitchoune/pnews/servlet/ArticleProvider.java
package net.wpitchoune.pnews.servlet;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.jsoup.Jsoup;

import com.rometools.rome.feed.synd.SyndEnclosure;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import com.rometools.rome.io.XmlReader;

import net.wpitchoune.pnews.Article;
import net.wpitchoune.pnews.ArticleStore;
import net.wpitchoune.pnews.Category;
import net.wpitchoune.pnews.Config;
import net.wpitchoune.pnews.EntityStat;
import net.wpitchoune.pnews.Feed;
import net.wpitchoune.pnews.classifier.NamedEntityRecognizer;

/**
 * Keeps an in-memory list of articles per category and refreshes it
 * periodically from the feeds declared in the configuration.
 *
 * One {@link Refresher} task per category is scheduled at construction time.
 * The category map is guarded by synchronizing on the map itself; each
 * per-category list is guarded by synchronizing on that list.
 */
public class ArticleProvider {
    private static final String CLASS_NAME = ArticleProvider.class.getName();
    private static final Logger LOG = Logger.getLogger(CLASS_NAME);

    /** Delay before the first refresh of a category, in seconds. */
    private static final long INITIAL_DELAY_SECONDS = 2;
    /** Period between two refreshes of a category, in seconds. */
    private static final long REFRESH_PERIOD_SECONDS = 600;
    /** Only articles newer than this many days contribute to entity statistics. */
    private static final long ENTITY_STATS_WINDOW_DAYS = 15;

    /** Orders articles from newest to oldest; articles without a date go last. */
    private static final Comparator<Article> NEWEST_FIRST =
        Comparator.comparing(Article::getPublicationDate,
                             Comparator.nullsLast(Comparator.<Instant>reverseOrder()));

    private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
    private final ScheduledExecutorService scheduler =
        Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
    private final Config config;

    /**
     * Creates the provider and schedules a periodic refresh for every
     * category returned by {@code config.getCategories()}.
     *
     * @param config the application configuration
     */
    public ArticleProvider(Config config) {
        this.config = config;
        for (Category cat : config.getCategories()) {
            scheduler.scheduleAtFixedRate(new Refresher(cat),
                                          INITIAL_DELAY_SECONDS,
                                          REFRESH_PERIOD_SECONDS,
                                          TimeUnit.SECONDS);
        }
    }

    /**
     * Downloads and parses the feed located at the given URL.
     *
     * @param u the feed URL
     * @return the parsed feed
     */
    private static SyndFeed getSyndFeed(String u)
            throws IllegalArgumentException, FeedException, MalformedURLException, IOException {
        // The reader wraps the network stream: close it once the feed is built.
        try (XmlReader r = new XmlReader(new URL(u))) {
            return new SyndFeedInput().build(r);
        }
    }

    /**
     * Returns the live (not copied) article list of a category, creating an
     * empty one on first access. Callers must synchronize on the returned
     * list before reading or modifying it.
     */
    private List<Article> getArticlesForUpdate(Category cat) {
        synchronized (articlesByCategory) {
            return articlesByCategory.computeIfAbsent(cat, k -> new ArrayList<>());
        }
    }

    /**
     * Tells whether an article with the given link is already in the list.
     *
     * @param articleLink the link to look for, must not be null
     * @param articles    the list to scan
     */
    private boolean exists(String articleLink, List<Article> articles) {
        synchronized (articles) {
            for (Article a : articles) {
                if (articleLink.equals(a.getLink())) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Returns the entry's updated date, else its published date, else the
     * current instant when the entry carries no date at all.
     */
    private Instant getArticleInstant(SyndEntry entry) {
        Date date = entry.getUpdatedDate();
        if (date == null) {
            date = entry.getPublishedDate();
        }
        return date == null ? Instant.now() : date.toInstant();
    }

    /**
     * Builds an {@link Article} from a feed entry.
     *
     * The thumbnail is the URL of the first enclosure whose type starts with
     * {@code image/}. The description is reduced to plain text with jsoup.
     * Named entities are extracted from the title and description only when
     * the language is {@code "en"}.
     */
    private Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang, Instant instant) {
        String feedTitle = feed.getTitle();
        if (feedTitle != null) {
            feedTitle = feedTitle.trim();
        }

        String thumbnail = null;
        for (SyndEnclosure e : entry.getEnclosures()) {
            final String type = e.getType();
            // Stop at the first image enclosure; skip enclosures of other
            // (or unknown) types instead of giving up after the first one.
            if (type != null && type.startsWith("image/")) {
                thumbnail = e.getUrl();
                break;
            }
        }

        final String title = entry.getTitle().trim();

        final String desc;
        if (entry.getDescription() != null) {
            desc = Jsoup.parse(entry.getDescription().getValue()).text();
        } else {
            desc = null;
            LOG.severe("No description for " + feedTitle + " - " + title);
        }

        final List<String> entities = new ArrayList<>();
        if ("en".equals(lang)) {
            try {
                NamedEntityRecognizer.classify(title, entities, config);
                if (desc != null) {
                    NamedEntityRecognizer.classify(desc, entities, config);
                }
            } catch (ClassCastException | ClassNotFoundException | IOException e1) {
                LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
            }
        }

        return new Article(link, title, desc, thumbnail, instant, feedTitle,
                           entities.toArray(new String[0]));
    }

    /**
     * Adds to the category every entry of the feed that is not already known
     * and that the configuration does not report as obsolete. The category
     * list is kept sorted from newest to oldest.
     */
    private void addArticles(Category cat, SyndFeed feed) {
        String feedTitle = feed.getTitle();
        if (feedTitle != null) {
            feedTitle = feedTitle.trim();
        }

        LOG.info("addArticles " + cat.getLabel() + " " + feedTitle
                 + " number of articles: " + feed.getEntries().size());

        // The list instance of a category never changes once created.
        final List<Article> articles = getArticlesForUpdate(cat);

        for (SyndEntry entry : feed.getEntries()) {
            final String rawLink = entry.getLink();
            if (rawLink == null) {
                // The link identifies the article; without it the entry is unusable.
                LOG.warning("addArticles entry without link in " + feedTitle);
                continue;
            }
            final String link = rawLink.trim();

            if (exists(link, articles)) {
                LOG.fine("addArticles " + link + " is already present");
                continue;
            }

            final Instant instant = getArticleInstant(entry);
            if (config.isObsolete(instant)) {
                continue;
            }

            final Article a = ArticleStore.singleton.getArticle(
                link, () -> toArticle(link, entry, feed, cat.getLanguage(), instant));

            // Add and sort under the same lock so readers never see an unsorted list.
            synchronized (articles) {
                articles.add(a);
                Collections.sort(articles, NEWEST_FIRST);
            }
        }

        LOG.info("addArticles done " + cat.getLabel());
    }

    /**
     * Fetches every feed configured for the category and adds its articles.
     * A failure on one feed is logged and does not prevent the next feeds
     * from being processed.
     */
    private void retrieveArticles(Category cat)
            throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
        final List<Feed> feeds = config.getFeedsByCategory().get(cat);

        if (feeds == null) {
            LOG.severe("No feed for category " + cat);
            return;
        }

        for (Feed f : feeds) {
            try {
                addArticles(cat, getSyndFeed(f.getURL()));
            } catch (Throwable e) {
                // Deliberately broad: one broken feed must not stop the others.
                LOG.log(Level.SEVERE,
                        "retrieveArticles failure " + cat.getLabel() + " " + f.toString(),
                        e);
            }
        }
    }

    /**
     * Returns a copy of the articles of a category, newest first.
     *
     * @param cat    the category
     * @param entity when not null, only articles for which
     *               {@code hasEntity(entity)} is true are returned
     * @return a new list that the caller may freely modify
     */
    public List<Article> getArticles(Category cat, String entity)
            throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
        final List<Article> articles = getArticlesForUpdate(cat);

        synchronized (articles) {
            if (entity == null) {
                return new ArrayList<>(articles);
            }

            final List<Article> result = new ArrayList<>(articles.size());
            for (Article a : articles) {
                if (a.hasEntity(entity)) {
                    result.add(a);
                }
            }
            return result;
        }
    }

    /**
     * Counts how often each entity appears in the articles of a category
     * published during the last 15 days.
     *
     * Articles without a publication date or without entities are ignored.
     *
     * @param cat the category
     * @return the statistics sorted by decreasing count
     */
    public List<EntityStat> getEntityStats(Category cat)
            throws IllegalArgumentException, MalformedURLException, FeedException, IOException {
        final String FUNCTION_NAME = "getEntityStats";

        LOG.entering(CLASS_NAME, FUNCTION_NAME, cat);

        final List<Article> articles = getArticles(cat, null);
        final Instant minInstant = Instant.now().minus(ENTITY_STATS_WINDOW_DAYS, ChronoUnit.DAYS);

        final Map<String, EntityStat> entities = new HashMap<>();
        for (Article a : articles) {
            final Instant published = a.getPublicationDate();
            if (published == null || !published.isAfter(minInstant) || a.getEntities() == null) {
                continue;
            }
            for (String e : a.getEntities()) {
                entities.computeIfAbsent(e, EntityStat::new).increment();
            }
        }

        final List<EntityStat> stats = new ArrayList<>(entities.values());
        stats.sort((o1, o2) -> Integer.compare(o2.getCount(), o1.getCount()));

        LOG.exiting(CLASS_NAME, FUNCTION_NAME, stats);

        return stats;
    }

    /** Periodic task refreshing the articles of one category. */
    private class Refresher implements Runnable {
        private final Category category;

        public Refresher(Category category) {
            this.category = category;
        }

        @Override
        public void run() {
            LOG.info("refresher "+ category.getLabel());

            try {
                retrieveArticles(category);
            } catch (FeedException | IOException | RuntimeException e) {
                // An exception escaping run() would cancel every later
                // execution of this periodic task, so log it instead.
                LOG.log(Level.SEVERE, "refresher failure", e);
            }

            LOG.info("refresher "+ category.getLabel() + " done");
        }
    }
}