| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.scoring.similarity.cosine; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.util.StringUtils; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; |
| import org.apache.nutch.scoring.similarity.util.LuceneTokenizer; |
| import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * This class creates a model used to store Document vector representation of the corpus. |
| * |
| */ |
| public class Model { |
| |
| //Currently only one file, but in future could accept a corpus hence an ArrayList |
| public static ArrayList<DocVector> docVectors = new ArrayList<>(); |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| public static boolean isModelCreated = false; |
| private static List<String> stopWords; |
| |
| public static synchronized void createModel(Configuration conf) throws IOException { |
| if(isModelCreated) { |
| LOG.info("Model exists, skipping model creation"); |
| return; |
| } |
| LOG.info("Creating Cosine model"); |
| try { |
| //If user has specified a stopword file other than the template |
| if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) { |
| stopWords = new ArrayList<String>(); |
| String stopWord; |
| BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file")))); |
| while ((stopWord = br.readLine()) != null) { |
| stopWords.add(stopWord); |
| } |
| LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file")); |
| } |
| |
| int[] ngramArr = retrieveNgrams(conf); |
| int mingram = ngramArr[0]; |
| int maxgram = ngramArr[1]; |
| LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram); |
| |
| // TODO : Allow for corpus of documents to be provided as gold standard. |
| String line; |
| StringBuilder sb = new StringBuilder(); |
| BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file")))); |
| while ((line = br.readLine()) != null) { |
| sb.append(line); |
| } |
| DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram); |
| if(goldStandard!=null) |
| docVectors.add(goldStandard); |
| else { |
| throw new Exception("Could not create DocVector for goldstandard"); |
| } |
| } catch (Exception e) { |
| LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), |
| StringUtils.stringifyException(e)); |
| } |
| if(docVectors.size()>0) { |
| LOG.info("Cosine model creation complete"); |
| isModelCreated = true; |
| } |
| else |
| LOG.info("Cosine model creation failed"); |
| } |
| |
| /** |
| * Used to create a DocVector from given String text. Used during the parse stage of the crawl |
| * cycle to create a DocVector of the currently parsed page from the parseText attribute value |
| * @param content The text to tokenize |
| * @param mingram Value of mingram for tokenizing |
| * @param maxgram Value of maxgram for tokenizing |
| */ |
| public static DocVector createDocVector(String content, int mingram, int maxgram) { |
| LuceneTokenizer tokenizer; |
| |
| if(mingram > 1 && maxgram > 1){ |
| LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); |
| tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); |
| } else if (mingram > 1) { |
| maxgram = mingram; |
| LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram); |
| tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram); |
| } |
| else if(stopWords!=null) { |
| tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, |
| StemFilterType.PORTERSTEM_FILTER); |
| } |
| else { |
| tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, |
| StemFilterType.PORTERSTEM_FILTER); |
| } |
| TokenStream tStream = tokenizer.getTokenStream(); |
| HashMap<String, Integer> termVector = new HashMap<>(); |
| try { |
| CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class); |
| tStream.reset(); |
| while(tStream.incrementToken()) { |
| String term = charTermAttribute.toString(); |
| LOG.debug(term); |
| if(termVector.containsKey(term)) { |
| int count = termVector.get(term); |
| count++; |
| termVector.put(term, count); |
| } |
| else { |
| termVector.put(term, 1); |
| } |
| } |
| DocVector docVector = new DocVector(); |
| docVector.setTermFreqVector(termVector); |
| return docVector; |
| } catch (IOException e) { |
| LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e)); |
| } |
| return null; |
| } |
| |
| public static float computeCosineSimilarity(DocVector docVector) { |
| float scores[] = new float[docVectors.size()]; |
| int i=0; |
| float maxScore = 0; |
| for(DocVector corpusDoc : docVectors) { |
| float numerator = docVector.dotProduct(corpusDoc); |
| float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm(); |
| float currentScore = numerator/denominator; |
| scores[i++] = currentScore; |
| maxScore = (currentScore>maxScore)? currentScore : maxScore; |
| } |
| // Returning the max score amongst all documents in the corpus |
| return maxScore; |
| } |
| |
| /** |
| * Retrieves mingram and maxgram from configuration |
| * @param conf Configuration to retrieve mingram and maxgram |
| * @return ngram array as mingram at first index and maxgram at second index |
| */ |
| public static int[] retrieveNgrams(Configuration conf){ |
| int[] ngramArr = new int[2]; |
| //Check if user has specified mingram or ngram for ngram cosine model |
| String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1"); |
| //mingram |
| ngramArr[0] = Integer.parseInt(ngramStr[0]); |
| if (ngramStr.length > 1) { |
| //maxgram |
| ngramArr[1] = Integer.parseInt(ngramStr[1]); |
| } else { |
| //maxgram |
| ngramArr[1] = ngramArr[0]; |
| } |
| return ngramArr; |
| } |
| } |