<version>3.8.0</version>
<classifier>models</classifier>
</dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.8.1</version>
+ </dependency>
</dependencies>
</project>
public String[] getEntities() {
return entities;
}
+ /**
+  * Returns the publication date held by this article.
+  *
+  * @return the value of the {@code publicationDate} field, handed out as is (no copy is made)
+  */
+ public Date getPublicationDate() {
+ return publicationDate;
+ }
}
private static final String CLASS_NAME = NER.class.getName();
private static final Logger LOG = Logger.getLogger(CLASS_NAME);
- public static String[] classify(String str) throws ClassCastException, ClassNotFoundException, IOException {
+ public static List<String> classify(String str, List<String> entities) throws ClassCastException, ClassNotFoundException, IOException {
CRFClassifier<CoreLabel> classifier;
List<List<CoreLabel>> out;
String cat, w;
- List<String> entities;
final String FUNCTION_NAME = "classify";
LOG.entering(CLASS_NAME, FUNCTION_NAME, str);
+
+ OpenNLP.classify(str, entities);
classifier = CRFClassifier.getDefaultClassifier();
out = classifier.classify(str);
- entities = new ArrayList<>();
for (List<CoreLabel> labels: out)
for (CoreLabel l: labels) {
cat = l.getString(AnswerAnnotation.class);
entities.add(w);
}
+ entities.remove("CNET");
+ entities.remove("Read More");
+ entities.remove("New");
+ entities.remove("App");
+
LOG.exiting(CLASS_NAME, FUNCTION_NAME, entities);
- return entities.toArray(new String[0]);
+ return entities;
}
public static void main(String[] args) throws Exception {
- classify("I live in Washington.");
+ classify("I live in Washington.", new ArrayList<>());
}
}
\ No newline at end of file
--- /dev/null
+package pnews;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.logging.Logger;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Named-entity extraction backed by Apache OpenNLP.
+ *
+ * <p>Adapted from
+ * http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example
+ *
+ * <p>The tokenizer model and the four name-finder models (organization, person, location, time)
+ * are read from classpath resources the first time they are needed and then kept in static
+ * fields. Loading is guarded by the {@code OpenNLP.class} monitor.
+ */
+public class OpenNLP {
+    private static final String CLASS_NAME = OpenNLP.class.getName();
+    private static final Logger LOG = Logger.getLogger(CLASS_NAME);
+    /** Spans whose probability is below this value are ignored. */
+    private static final double MIN_PROBABILITY = 0.60;
+    private static TokenNameFinderModel organizationModel;
+    private static TokenNameFinderModel personModel;
+    private static TokenNameFinderModel locationModel;
+    private static TokenNameFinderModel timeModel;
+    private static TokenizerModel tokenModel;
+
+    /**
+     * Runs the organization, person, location and time name finders over {@code str} and appends
+     * every detected entity that is not already in {@code entities}.
+     *
+     * @param str the text to analyse
+     * @param entities the list the detected entities are appended to; it is modified in place
+     * @return the same {@code entities} list that was passed in
+     * @throws IOException if a model resource is missing or cannot be read
+     */
+    public static List<String> classify(String str, List<String> entities) throws IOException {
+        // The input is identical for every model, so tokenize it only once.
+        String[] tokens = tokenize(str);
+
+        classify(tokens, getOrganizationModel(), entities);
+        classify(tokens, getPersonModel(), entities);
+        classify(tokens, getLocationModel(), entities);
+        classify(tokens, getTimeModel(), entities);
+
+        return entities;
+    }
+
+    /** Returns the organization model, loading it from the classpath on first use. */
+    private static TokenNameFinderModel getOrganizationModel() throws IOException {
+        synchronized (OpenNLP.class) {
+            if (organizationModel == null) {
+                organizationModel = loadNameFinderModel("/en-ner-organization.bin");
+            }
+            return organizationModel;
+        }
+    }
+
+    /** Returns the person model, loading it from the classpath on first use. */
+    private static TokenNameFinderModel getPersonModel() throws IOException {
+        synchronized (OpenNLP.class) {
+            if (personModel == null) {
+                personModel = loadNameFinderModel("/en-ner-person.bin");
+            }
+            return personModel;
+        }
+    }
+
+    /** Returns the location model, loading it from the classpath on first use. */
+    private static TokenNameFinderModel getLocationModel() throws IOException {
+        synchronized (OpenNLP.class) {
+            if (locationModel == null) {
+                locationModel = loadNameFinderModel("/en-ner-location.bin");
+            }
+            return locationModel;
+        }
+    }
+
+    /** Returns the time model, loading it from the classpath on first use. */
+    private static TokenNameFinderModel getTimeModel() throws IOException {
+        synchronized (OpenNLP.class) {
+            if (timeModel == null) {
+                timeModel = loadNameFinderModel("/en-ner-time.bin");
+            }
+            return timeModel;
+        }
+    }
+
+    /**
+     * Opens a classpath resource, failing with a descriptive {@link IOException} when it is
+     * absent instead of handing {@code null} to a model constructor.
+     */
+    private static InputStream openResource(String resource) throws IOException {
+        InputStream in = OpenNLP.class.getResourceAsStream(resource);
+        if (in == null) {
+            throw new IOException("Model resource not found on the classpath: " + resource);
+        }
+        return in;
+    }
+
+    /** Reads a name-finder model from the given classpath resource and closes the stream. */
+    private static TokenNameFinderModel loadNameFinderModel(String resource) throws IOException {
+        try (InputStream in = openResource(resource)) {
+            return new TokenNameFinderModel(in);
+        }
+    }
+
+    /**
+     * Applies one name-finder model to already tokenized text and appends the entities found.
+     *
+     * @param tokens the tokens of the text to analyse
+     * @param model the name-finder model to apply
+     * @param entities the list the detected entities are appended to, without duplicates
+     * @return the same {@code entities} list that was passed in
+     */
+    private static List<String> classify(String[] tokens, TokenNameFinderModel model, List<String> entities) {
+        NameFinderME nameFinder = new NameFinderME(model);
+
+        for (Span s : nameFinder.find(tokens)) {
+            // Skip low-confidence spans, and empty spans which would otherwise yield no text.
+            if (s.getProb() < MIN_PROBABILITY || s.getStart() >= s.getEnd()) {
+                continue;
+            }
+
+            // An entity is the span's tokens joined by single spaces.
+            StringBuilder name = new StringBuilder(tokens[s.getStart()]);
+            for (int i = s.getStart() + 1; i < s.getEnd(); i++) {
+                name.append(' ').append(tokens[i]);
+            }
+            String entity = name.toString();
+
+            LOG.finest(entity + " " + s.getProb() + " " + s.toString());
+            if (!entities.contains(entity)) {
+                entities.add(entity);
+            }
+        }
+
+        return entities;
+    }
+
+    /**
+     * Splits a sentence into tokens with the English tokenizer model, which is loaded from the
+     * classpath on first use.
+     *
+     * @param sentence the text to tokenize
+     * @return the tokens produced by the tokenizer
+     * @throws IOException if the tokenizer model resource is missing or cannot be read
+     */
+    public static String[] tokenize(String sentence) throws IOException {
+        TokenizerModel model;
+        synchronized (OpenNLP.class) {
+            if (tokenModel == null) {
+                try (InputStream in = openResource("/en-token.bin")) {
+                    tokenModel = new TokenizerModel(in);
+                }
+            }
+            model = tokenModel;
+        }
+        return new TokenizerME(model).tokenize(sentence);
+    }
+}
private static final String CLASS_NAME = ArticleProvider.class.getName();
private static final Logger LOG = Logger.getLogger(CLASS_NAME);
private final Map<Category, List<Article>> articlesByCategory = new HashMap<>();
- private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
+ private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(Runtime.getRuntime().availableProcessors());
private final Config config;
public ArticleProvider(Config config) {
private static Article toArticle(String link, SyndEntry entry, SyndFeed feed, String lang) {
String desc, title, thumbnail, feedTitle, str;
Date date;
- String[] entities;
+ List<String> entities;
feedTitle = feed.getTitle();
if (feedTitle != null) {
LOG.severe("The article " + feedTitle + " - " + title + " does not have a date");
- entities = null;
+ entities = new ArrayList<>();
if (desc != null && lang.equals("en"))
try {
- entities = NER.classify(desc);
+ NER.classify(title, entities);
+ NER.classify(desc, entities);
} catch (ClassCastException | ClassNotFoundException | IOException e1) {
LOG.log(Level.SEVERE, "Cannot classify " + feedTitle, e1);
}
- return new Article(link, title, desc, thumbnail, date, feedTitle, entities);
+ return new Article(link, title, desc, thumbnail, date, feedTitle, entities.toArray(new String[0]));
}
private void addArticles(Category cat, SyndFeed feed) {
entities = new HashMap<>();
for (Article a: articles)
- if (a.getEntities() != null)
+ if (a.getEntities() != null) {
for (String e: a.getEntities()) {
s = entities.get(e);
if (s == null) {
}
s.increment();
}
+ }
stats = new ArrayList<>(entities.values());
stats.sort(new Comparator<EntityStat>() {
"http://feeds.macrumors.com/MacRumors-All": { "categories": ["en_technologie"]},
"https://www.digitaltrends.com/rss-home/": { "categories": ["en_technologie"]},
"http://www.zdnet.com/news/rss.xml": { "categories": ["en_technologie"]},
- "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]}
+ "https://www.androidheadlines.com/feed": { "categories": ["en_technologie"]},
+ "https://www.nasa.gov/rss/dyn/breaking_news.rss": { "categories": ["en_technologie"]},
+ "http://www.computerweekly.com/rss/RSS-Feed.xml": { "categories": ["en_technologie"]}
}
}