added OpenNLP support
author Jean-Philippe Orsini <orsinije@fr.ibm.com>
Tue, 31 Oct 2017 00:51:16 +0000 (01:51 +0100)
committer Jean-Philippe Orsini <orsinije@fr.ibm.com>
Tue, 31 Oct 2017 00:51:16 +0000 (01:51 +0100)
12 files changed:
war/pom.xml
war/src/main/java/pnews/Article.java
war/src/main/java/pnews/NER.java
war/src/main/java/pnews/OpenNLP.java [new file with mode: 0644]
war/src/main/java/pnews/servlet/ArticleProvider.java
war/src/main/resources/en-ner-date.bin [new file with mode: 0644]
war/src/main/resources/en-ner-location.bin [new file with mode: 0644]
war/src/main/resources/en-ner-organization.bin [new file with mode: 0644]
war/src/main/resources/en-ner-person.bin [new file with mode: 0644]
war/src/main/resources/en-ner-time.bin [new file with mode: 0644]
war/src/main/resources/en-token.bin [new file with mode: 0644]
war/src/main/resources/feeds.json

index db748f7..cf764a6 100644 (file)
                         <version>3.8.0</version>
                         <classifier>models</classifier>
                 </dependency>
+                <dependency>
+                         <groupId>org.apache.opennlp</groupId>
+                         <artifactId>opennlp-tools</artifactId>
+                         <version>1.8.1</version>
+                </dependency>
         </dependencies>
 </project>
index 70f69ea..f38f792 100644 (file)
@@ -26,4 +26,8 @@ public class Article {
         public String[] getEntities() {
                 return entities;
         }
+        
+        public Date getPublicationDate() {
+                return publicationDate;
+        }
 }
index f8238c1..ac34c08 100644 (file)
@@ -14,19 +14,19 @@ public class NER {
         private static final String CLASS_NAME = NER.class.getName();
         private static final Logger LOG = Logger.getLogger(CLASS_NAME); 
         
-        public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException {
+        public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
                 CRFClassifier<CoreLabel> classifier;
                 List<List<CoreLabel>> out;
                 String cat, w;
-                List<String> entities;
                 final String FUNCTION_NAME = "classify";                
                 
                 LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
+
+                OpenNLP.classify(str, entities);
                 
                 classifier = CRFClassifier.getDefaultClassifier();
                 out = classifier.classify(str);
                 
-                entities = new ArrayList<>();
                 for (List<CoreLabel> labels: out)
                         for (CoreLabel l: labels) {
                                 cat = l.getString(AnswerAnnotation.class);
@@ -35,12 +35,17 @@ public class NER {
                                         entities.add(w);
                         }
                 
+                entities.remove("CNET");
+                entities.remove("Read More");
+                entities.remove("New");
+                entities.remove("App");
+                
                 LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
                 
-                return entities.toArray(new String[0]);
+                return entities;
         }
         
         public static void main(String[] args) throws Exception {
-                classify("I live in Washington.");
+                classify("I live in Washington.", new ArrayList<>());
         }
 }
\ No newline at end of file
diff --git a/war/src/main/java/pnews/OpenNLP.java b/war/src/main/java/pnews/OpenNLP.java
new file mode 100644 (file)
index 0000000..07fbba5
--- /dev/null
@@ -0,0 +1,160 @@
+package pnews;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Named-entity extraction backed by Apache OpenNLP models read from the classpath.
+ * <p>
+ * Each model is loaded on first use while holding the {@code OpenNLP.class} monitor
+ * and is then reused by every later call.
+ *
+ * @see <a href="http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example">OpenNLP named entity recognition example</a>
+ */
+public class OpenNLP {
+        private static final String CLASS_NAME = OpenNLP.class.getName();
+        private static final Logger LOG = Logger.getLogger(CLASS_NAME);
+        /** Spans with a probability below this value are ignored. */
+        private static final double MIN_PROBABILITY = 0.60;
+        private static TokenNameFinderModel organizationModel;
+        private static TokenNameFinderModel personModel;
+        private static TokenNameFinderModel locationModel;
+        private static TokenNameFinderModel timeModel;
+        private static TokenizerModel tokenModel;
+
+        /**
+         * Appends the organizations, persons, locations and times found in {@code str}
+         * to {@code entities}. An entity already in the list is not added again.
+         *
+         * @param str text to analyse; {@code null} or empty text leaves the list untouched
+         * @param entities list receiving the entities, modified in place
+         * @return the {@code entities} list that was passed in
+         * @throws IOException if a model cannot be found or read
+         */
+        public static List<String> classify(String str, List<String> entities) throws IOException {
+                if (str == null || str.isEmpty())
+                        return entities;
+
+                // Tokenize once and share the tokens between the four models.
+                String[] tokens = tokenize(str);
+
+                classify(tokens, getOrganizationModel(), entities);
+                classify(tokens, getPersonModel(), entities);
+                classify(tokens, getLocationModel(), entities);
+                classify(tokens, getTimeModel(), entities);
+
+                return entities;
+        }
+
+        /**
+         * Splits a sentence into tokens with the {@code /en-token.bin} model.
+         *
+         * @param sentence text to tokenize
+         * @return the tokens produced by the tokenizer
+         * @throws IOException if the tokenizer model cannot be found or read
+         */
+        public static String[] tokenize(String sentence) throws IOException {
+                // Only the model is cached; a new TokenizerME is built for every call.
+                return new TokenizerME(getTokenModel()).tokenize(sentence);
+        }
+
+        /** Returns the organization model, loading it on first use. */
+        private static synchronized TokenNameFinderModel getOrganizationModel() throws IOException {
+                if (organizationModel == null)
+                        organizationModel = loadNameFinderModel("/en-ner-organization.bin");
+
+                return organizationModel;
+        }
+
+        /** Returns the person model, loading it on first use. */
+        private static synchronized TokenNameFinderModel getPersonModel() throws IOException {
+                if (personModel == null)
+                        personModel = loadNameFinderModel("/en-ner-person.bin");
+
+                return personModel;
+        }
+
+        /** Returns the location model, loading it on first use. */
+        private static synchronized TokenNameFinderModel getLocationModel() throws IOException {
+                if (locationModel == null)
+                        locationModel = loadNameFinderModel("/en-ner-location.bin");
+
+                return locationModel;
+        }
+
+        /** Returns the time model, loading it on first use. */
+        private static synchronized TokenNameFinderModel getTimeModel() throws IOException {
+                if (timeModel == null)
+                        timeModel = loadNameFinderModel("/en-ner-time.bin");
+
+                return timeModel;
+        }
+
+        /** Returns the tokenizer model, loading it on first use. */
+        private static synchronized TokenizerModel getTokenModel() throws IOException {
+                if (tokenModel == null) {
+                        try (InputStream in = openResource("/en-token.bin")) {
+                                tokenModel = new TokenizerModel(in);
+                        }
+                }
+
+                return tokenModel;
+        }
+
+        /**
+         * Reads a name finder model from the classpath. The stream is closed here explicitly
+         * instead of relying on the model constructor to do it.
+         */
+        private static TokenNameFinderModel loadNameFinderModel(String resource) throws IOException {
+                try (InputStream in = openResource(resource)) {
+                        return new TokenNameFinderModel(in);
+                }
+        }
+
+        /**
+         * Opens a classpath resource, failing with an {@link IOException} rather than handing
+         * a {@code null} stream to the model constructors when the resource is missing.
+         */
+        private static InputStream openResource(String resource) throws IOException {
+                InputStream in = OpenNLP.class.getResourceAsStream(resource);
+
+                if (in == null)
+                        throw new IOException("Model " + resource + " not found on the classpath");
+
+                return in;
+        }
+
+        /**
+         * Runs one name finder model over the tokens and appends every span whose probability
+         * is at least {@link #MIN_PROBABILITY} to {@code entities}, skipping duplicates.
+         */
+        private static List<String> classify(String[] tokens, TokenNameFinderModel model, List<String> entities) {
+                // A new NameFinderME is built for every call; only the model is shared.
+                NameFinderME nameFinder = new NameFinderME(model);
+
+                for (Span s: nameFinder.find(tokens)) {
+                        if (s.getProb() < MIN_PROBABILITY)
+                                continue;
+
+                        // The entity is the span's tokens joined with single spaces.
+                        String entity = String.join(" ", Arrays.copyOfRange(tokens, s.getStart(), s.getEnd()));
+
+                        if (LOG.isLoggable(Level.FINEST))
+                                LOG.finest(entity + " " + s.getProb() + " " + s.toString());
+                        if (!entities.contains(entity))
+                                entities.add(entity);
+                }
+
+                return entities;
+        }
+}
index 05ddedd..5efea15 100644 (file)
@@ -35,7 +35,7 @@ public class ArticleProvider {
         private static final String CLASS_NAME = ArticleProvider.class.getName();
         private static final Logger LOG = Logger.getLogger(CLASS_NAME);
         private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
-        private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
+        private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
         private final Config config;
         
         public ArticleProvider(Config config) {
@@ -77,7 +77,7 @@ public class ArticleProvider {
         private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
                 String desc, title, thumbnail, feedTitle, str;
                 Date date;
-                String[] entities;
+                List<String> entities;
                 
                 feedTitle = feed.getTitle();
                 if (feedTitle != null) {
@@ -112,15 +112,16 @@ public class ArticleProvider {
                         LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
                                      
                 
-                entities = null;
+                entities = new ArrayList<>();
                 if (desc != null && lang.equals("en"))
                         try {
-                                entities = NER.classify(desc);
+                                NER.classify(title, entities);
+                                NER.classify(desc, entities);
                         } catch (ClassCastException | ClassNotFoundException | IOException e1) {
                                 LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);                         
                         }
                 
-                return new Article(link, title, desc, thumbnail, date, feedTitle, entities);
+                return new Article(link, title, desc, thumbnail, date, feedTitle, entities.toArray(new String[0]));
         }
         
         private void addArticles(Category cat, SyndFeed feed) {
@@ -210,7 +211,7 @@ public class ArticleProvider {
                 
                 entities = new HashMap<>();
                 for (Article a: articles) 
-                        if (a.getEntities() != null)
+                        if (a.getEntities() != null) {
                                 for (String e: a.getEntities()) {
                                         s = entities.get(e);
                                         if (s == null) {
@@ -219,6 +220,7 @@ public class ArticleProvider {
                                         }
                                         s.increment();
                                 }                
+                        }
                 
                 stats = new ArrayList<>(entities.values());
                 stats.sort(new Comparator<EntityStat>() {
diff --git a/war/src/main/resources/en-ner-date.bin b/war/src/main/resources/en-ner-date.bin
new file mode 100644 (file)
index 0000000..a69923a
Binary files /dev/null and b/war/src/main/resources/en-ner-date.bin differ
diff --git a/war/src/main/resources/en-ner-location.bin b/war/src/main/resources/en-ner-location.bin
new file mode 100644 (file)
index 0000000..f3788bc
Binary files /dev/null and b/war/src/main/resources/en-ner-location.bin differ
diff --git a/war/src/main/resources/en-ner-organization.bin b/war/src/main/resources/en-ner-organization.bin
new file mode 100644 (file)
index 0000000..1fb6d9f
Binary files /dev/null and b/war/src/main/resources/en-ner-organization.bin differ
diff --git a/war/src/main/resources/en-ner-person.bin b/war/src/main/resources/en-ner-person.bin
new file mode 100644 (file)
index 0000000..2f68318
Binary files /dev/null and b/war/src/main/resources/en-ner-person.bin differ
diff --git a/war/src/main/resources/en-ner-time.bin b/war/src/main/resources/en-ner-time.bin
new file mode 100644 (file)
index 0000000..a5d8aa1
Binary files /dev/null and b/war/src/main/resources/en-ner-time.bin differ
diff --git a/war/src/main/resources/en-token.bin b/war/src/main/resources/en-token.bin
new file mode 100644 (file)
index 0000000..c417277
Binary files /dev/null and b/war/src/main/resources/en-token.bin differ
index c0d625f..e4d9079 100644 (file)
                 "http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]},
                 "https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]},
                 "http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]},
-                "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}
+                "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]},
+                "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]},
+                "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}
         }
 }