X-Git-Url: http://git.wpitchoune.net/gitweb/?p=pnews.git;a=blobdiff_plain;f=war%2Fsrc%2Fmain%2Fjava%2Fnet%2Fwpitchoune%2Fpnews%2Fclassifier%2FOpenNLP.java;fp=war%2Fsrc%2Fmain%2Fjava%2Fnet%2Fwpitchoune%2Fpnews%2Fclassifier%2FOpenNLP.java;h=ff9707d4bc161e0c65aaa3c216f991b7d0ca975c;hp=0000000000000000000000000000000000000000;hb=aff83c8798602b535d13edeaffdb8f4238e2bbf5;hpb=88a7ba9745b8318ca6c4f741906a40e3d6a8f07e diff --git a/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java b/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java new file mode 100644 index 0000000..ff9707d --- /dev/null +++ b/war/src/main/java/net/wpitchoune/pnews/classifier/OpenNLP.java @@ -0,0 +1,101 @@ +package net.wpitchoune.pnews.classifier; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.logging.Logger; + +import net.wpitchoune.pnews.Config; +import opennlp.tools.namefind.NameFinderME; +import opennlp.tools.namefind.TokenNameFinderModel; +import opennlp.tools.tokenize.TokenizerME; +import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.Span; + +/** http://www.devglan.com/artificial-intelligence/opennlp-named-entity-recognition-example **/ +public class OpenNLP { + private static final String CLASS_NAME = OpenNLP.class.getName(); + private static final Logger LOG = Logger.getLogger(CLASS_NAME); + private static TokenNameFinderModel organizationModel; + private static TokenNameFinderModel personModel; + private static TokenNameFinderModel locationModel; + private static TokenizerModel tokenModel; + + public static List classify(String str, List entities, Config config) throws IOException { + classify(str, getOrganizationModel(), entities, config); + classify(str, getPersonModel(), entities, config); + classify(str, getLocationModel(), entities, config); + + return entities; + } + + private static TokenNameFinderModel getOrganizationModel() throws IOException { + synchronized (OpenNLP.class) { + if (organizationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-organization.bin"); + organizationModel = new TokenNameFinderModel(inputStream); + } + } + + return organizationModel; + } + + private static TokenNameFinderModel getPersonModel() throws IOException { + synchronized (OpenNLP.class) { + if (personModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-person.bin"); + personModel = new TokenNameFinderModel(inputStream); + } + } + + return personModel; + } + + private static TokenNameFinderModel getLocationModel() throws IOException { + synchronized (OpenNLP.class) { + if (locationModel == null) { + InputStream inputStream = OpenNLP.class.getResourceAsStream("/en-ner-location.bin"); + locationModel = new TokenNameFinderModel(inputStream); + } + } + + return locationModel; + } + + private static List classify(String str, TokenNameFinderModel model, List entities, Config config) throws IOException { + String entity; + + NameFinderME nameFinder = new NameFinderME(model); + String[] tokens = tokenize(str); + Span nameSpans[] = nameFinder.find(tokens); + + for(Span s: nameSpans) { + if (s.getProb() < 0.60) + continue ; + + entity = null; + for (int i = s.getStart(); i < s.getEnd(); i++) + if (entity == null) + entity = tokens[i]; + else + entity += " " + tokens[i]; + + LOG.finest(entity + " " + s.getProb() + " " + s.toString()); + if (!config.isBlacklistedEntity(entity) && !entities.contains(entity)) + entities.add(config.getEntityAlias(entity)); + } + + return entities; + } + + private static String[] tokenize(String sentence) throws IOException { + synchronized (OpenNLP.class) { + if (tokenModel == null) { + InputStream inputStreamTokenizer = OpenNLP.class.getResourceAsStream("/en-token.bin"); + tokenModel = new TokenizerModel(inputStreamTokenizer); + } + } + TokenizerME tokenizer = new TokenizerME(tokenModel); + return tokenizer.tokenize(sentence); + } +}