From: Jean-Philippe Orsini Date: Tue, 31 Oct 2017 00:51:16 +0000 (+0100) Subject: added openlp support X-Git-Url: http://git.wpitchoune.net/gitweb/?p=pnews.git;a=commitdiff_plain;h=c6f5722d9572158f0d561819f080f8ccf4e3c7d9 added openlp support --- diff --git a/war/pom.xml b/war/pom.xml index db748f7..cf764a6 100644 --- a/war/pom.xml +++ b/war/pom.xml @@ -45,5 +45,10 @@ 3.8.0 models + + org.apache.opennlp + opennlp-tools + 1.8.1 + diff --git a/war/src/main/java/pnews/Article.java b/war/src/main/java/pnews/Article.java index 70f69ea..f38f792 100644 --- a/war/src/main/java/pnews/Article.java +++ b/war/src/main/java/pnews/Article.java @@ -26,4 +26,8 @@ public class Article { public String[] getEntities() { return entities; } + + public Date getPublicationDate() { + return publicationDate; + } } diff --git a/war/src/main/java/pnews/NER.java b/war/src/main/java/pnews/NER.java index f8238c1..ac34c08 100644 --- a/war/src/main/java/pnews/NER.java +++ b/war/src/main/java/pnews/NER.java @@ -14,19 +14,19 @@ public class NER { private static final String CLASS_NAME = NER.class.getName(); private static final Logger LOG = Logger.getLogger(CLASS_NAME); - public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException { + public static List classify(String str, List entities) throws ClassCastException, ClassNotFoundException, IOException { CRFClassifier classifier; List> out; String cat, w; - List entities; final String FUNCTION_NAME = "classify"; LOG.entering(CLASS_NAME, FUNCTION_NAME, str); + + OpenNLP.classify(str, entities); classifier = CRFClassifier.getDefaultClassifier(); out = classifier.classify(str); - entities = new ArrayList<>(); for (List labels: out) for (CoreLabel l: labels) { cat = l.getString(AnswerAnnotation.class); @@ -35,12 +35,17 @@ public class NER { entities.add(w); } + entities.remove("CNET"); + entities.remove("Read More"); + entities.remove("New"); + entities.remove("App"); + LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities); - return entities.toArray(new String[0]); + return entities; } public static void main(String[] args) throws Exception { - classify("I live in Washington."); + classify("I live in Washington.", new ArrayList<>()); } } \ No newline at end of file diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java new file mode 100644 index 0000000..07fbba5 --- /dev/null +++ b/war/src/main/java/pnews/OpenNLP.java @@ -0,0 +1,115 @@ +package pnews; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.logging.Logger; + +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.Span; + +/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ +public class OpenNLP { + private static final String CLASS_NAME = OpenNLP.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static TokenNameFinderModel organizationModel; + private static TokenNameFinderModel personModel; + private static TokenNameFinderModel locationModel; + private static TokenNameFinderModel timeModel; + private static TokenizerModel tokenModel; + + public static List classify(String str, List entities) throws IOException { + classify(str, getOrganizationModel(), entities); + + classify(str, getPersonModel(), entities); + classify(str, getLocationModel(), entities); + + classify(str, getTimeModel(), entities); + + return entities; + } + + private static TokenNameFinderModel getOrganizationModel() throws IOException { + synchronized (OpenNLP.class) { + if (organizationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin"); + organizationModel = new TokenNameFinderModel(inputStream); + } + } + + return organizationModel; + } + + private static TokenNameFinderModel getPersonModel() throws IOException { + synchronized (OpenNLP.class) { + if (personModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin"); + personModel = new TokenNameFinderModel(inputStream); + } + } + + return personModel; + } + + private static TokenNameFinderModel getLocationModel() throws IOException { + synchronized (OpenNLP.class) { + if (locationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin"); + locationModel = new TokenNameFinderModel(inputStream); + } + } + + return locationModel; + } + + private static TokenNameFinderModel getTimeModel() throws IOException { + synchronized (OpenNLP.class) { + if (timeModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-time.bin"); + timeModel = new TokenNameFinderModel(inputStream); + } + } + + return timeModel; + } + + private static List classify(String str, TokenNameFinderModel model, List entities) throws IOException { + String entity; + + NameFinderME nameFinder = new NameFinderME(model); + String[] tokens = tokenize(str); + Span nameSpans[] = nameFinder.find(tokens); + + for(Span s: nameSpans) { + if (s.getProb() < 0.60) + continue ; + + entity = null; + for (int i = s.getStart(); i < s.getEnd(); i++) + if (entity == null) + entity = tokens[i]; + else + entity += " " + tokens[i]; + + LOG.finest(entity + " " + s.getProb() + " " + s.toString()); + if (!entities.contains(entity)) + entities.add(entity); + } + + return entities; + } + + public static String[] tokenize(String sentence) throws IOException { + synchronized (OpenNLP.class) { + if (tokenModel == null) { + InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); + tokenModel = new TokenizerModel(inputStreamTokenizer); + } + } + TokenizerME tokenizer = new TokenizerME(tokenModel); + return tokenizer.tokenize(sentence); + } +} diff --git a/war/src/main/java/pnews/servlet/ArticleProvider.java b/war/src/main/java/pnews/servlet/ArticleProvider.java index 05ddedd..5efea15 100644 --- a/war/src/main/java/pnews/servlet/ArticleProvider.java +++ b/war/src/main/java/pnews/servlet/ArticleProvider.java @@ -35,7 +35,7 @@ public class ArticleProvider { private static final String CLASS_NAME = ArticleProvider.class.getName(); private static final Logger LOG = Logger.getLogger(CLASS_NAME); private final Map> articlesByCategory = new HashMap<>(); - private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2); + private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors()); private final Config config; public ArticleProvider(Config config) { @@ -77,7 +77,7 @@ public class ArticleProvider { private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) { String desc, title, thumbnail, feedTitle, str; Date date; - String[] entities; + List entities; feedTitle = feed.getTitle(); if (feedTitle != null) { @@ -112,15 +112,16 @@ public class ArticleProvider { LOG.severe("The article " + feedTitle + " - " + title + " does not have a date"); - entities = null; + entities = new ArrayList<>(); if (desc != null && lang.equals("en")) try { - entities = NER.classify(desc); + NER.classify(title, entities); + NER.classify(desc, entities); } catch (ClassCastException | ClassNotFoundException | IOException e1) { LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1); } - return new Article(link, title, desc, thumbnail, date, feedTitle, entities); + return new Article(link, title, desc, thumbnail, date, feedTitle, entities.toArray(new String[0])); } private void addArticles(Category cat, SyndFeed feed) { @@ -210,7 +211,7 @@ public class ArticleProvider { entities = new HashMap<>(); for (Article a: articles) - if (a.getEntities() != null) + if (a.getEntities() != null) { for (String e: a.getEntities()) { s = entities.get(e); if (s == null) { @@ -219,6 +220,7 @@ public class ArticleProvider { } s.increment(); } + } stats = new ArrayList<>(entities.values()); stats.sort(new Comparator() { diff --git a/war/src/main/resources/en-ner-date.bin b/war/src/main/resources/en-ner-date.bin new file mode 100644 index 0000000..a69923a Binary files /dev/null and b/war/src/main/resources/en-ner-date.bin differ diff --git a/war/src/main/resources/en-ner-location.bin b/war/src/main/resources/en-ner-location.bin new file mode 100644 index 0000000..f3788bc Binary files /dev/null and b/war/src/main/resources/en-ner-location.bin differ diff --git a/war/src/main/resources/en-ner-organization.bin b/war/src/main/resources/en-ner-organization.bin new file mode 100644 index 0000000..1fb6d9f Binary files /dev/null and b/war/src/main/resources/en-ner-organization.bin differ diff --git a/war/src/main/resources/en-ner-person.bin b/war/src/main/resources/en-ner-person.bin new file mode 100644 index 0000000..2f68318 Binary files /dev/null and b/war/src/main/resources/en-ner-person.bin differ diff --git a/war/src/main/resources/en-ner-time.bin b/war/src/main/resources/en-ner-time.bin new file mode 100644 index 0000000..a5d8aa1 Binary files /dev/null and b/war/src/main/resources/en-ner-time.bin differ diff --git a/war/src/main/resources/en-token.bin b/war/src/main/resources/en-token.bin new file mode 100644 index 0000000..c417277 Binary files /dev/null and b/war/src/main/resources/en-token.bin differ diff --git a/war/src/main/resources/feeds.json b/war/src/main/resources/feeds.json index c0d625f..e4d9079 100644 --- a/war/src/main/resources/feeds.json +++ b/war/src/main/resources/feeds.json @@ -103,6 +103,8 @@ "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]}, "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]}, "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]}, - "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]} + "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}, + "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]}, + "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]} } }