| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.classification.document; |
| |
| import java.io.IOException; |
| import java.util.LinkedHashMap; |
| import java.util.Map; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.en.EnglishAnalyzer; |
| import org.apache.lucene.classification.ClassificationResult; |
| import org.apache.lucene.classification.ClassificationTestBase; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriterConfig; |
| import org.apache.lucene.index.RandomIndexWriter; |
| import org.apache.lucene.util.BytesRef; |
| import org.junit.Before; |
| |
| /** Base class for testing {@link org.apache.lucene.classification.Classifier}s */ |
| public abstract class DocumentClassificationTestBase<T> extends ClassificationTestBase<T> { |
| |
| protected static final BytesRef VIDEOGAME_RESULT = new BytesRef("videogames"); |
| protected static final BytesRef VIDEOGAME_ANALYZED_RESULT = new BytesRef("videogam"); |
| protected static final BytesRef BATMAN_RESULT = new BytesRef("batman"); |
| |
| protected String titleFieldName = "title"; |
| protected String authorFieldName = "author"; |
| |
| protected Analyzer analyzer; |
| protected Map<String, Analyzer> field2analyzer; |
| protected IndexReader indexReader; |
| |
| @Before |
| public void init() throws IOException { |
| analyzer = new EnglishAnalyzer(); |
| field2analyzer = new LinkedHashMap<>(); |
| field2analyzer.put(textFieldName, analyzer); |
| field2analyzer.put(titleFieldName, analyzer); |
| field2analyzer.put(authorFieldName, analyzer); |
| indexReader = populateDocumentClassificationIndex(analyzer); |
| } |
| |
| protected double checkCorrectDocumentClassification( |
| DocumentClassifier<T> classifier, Document inputDoc, T expectedResult) throws Exception { |
| ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc); |
| assertNotNull(classificationResult.getAssignedClass()); |
| assertEquals( |
| "got an assigned class of " + classificationResult.getAssignedClass(), |
| expectedResult, |
| classificationResult.getAssignedClass()); |
| double score = classificationResult.getScore(); |
| assertTrue("score should be between 0 and 1, got:" + score, score <= 1 && score >= 0); |
| return score; |
| } |
| |
| protected IndexReader populateDocumentClassificationIndex(Analyzer analyzer) throws IOException { |
| indexWriter.close(); |
| indexWriter = |
| new RandomIndexWriter( |
| random(), |
| dir, |
| newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); |
| indexWriter.commit(); |
| String text; |
| String title; |
| String author; |
| |
| Document doc = new Document(); |
| title = "Video games are an economic business"; |
| text = |
| "Video games have become an art form and an industry. The video game industry is of increasing" |
| + " commercial importance, with growth driven particularly by the emerging Asian markets and mobile games." |
| + " As of 2015, video games generated sales of USD 74 billion annually worldwide, and were the third-largest" |
| + " segment in the U.S. entertainment market, behind broadcast and cable TV."; |
| author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "videogames", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Video games: the definition of fun on PC and consoles"; |
| text = |
| "A video game is an electronic game that involves human interaction with a user interface to generate" |
| + " visual feedback on a video device. The word video in video game traditionally referred to a raster display device," |
| + "[1] but it now implies any type of display device that can produce two- or three-dimensional images." |
| + " The electronic systems used to play video games are known as platforms; examples of these are personal" |
| + " computers and video game consoles. These platforms range from large mainframe computers to small handheld devices." |
| + " Specialized video games such as arcade games, while previously common, have gradually declined in use."; |
| author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "videogames", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Video games: the history across PC, consoles and fun"; |
| text = |
| "Early games used interactive electronic devices with various display formats. The earliest example is" |
| + " from 1947—a device was filed for a patent on 25 January 1947, by Thomas T. Goldsmith Jr. and Estle Ray Mann," |
| + " and issued on 14 December 1948, as U.S. Patent 2455992.[2]" |
| + "Inspired by radar display tech, it consisted of an analog device that allowed a user to control a vector-drawn" |
| + " dot on the screen to simulate a missile being fired at targets, which were drawings fixed to the screen.[3]"; |
| author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "videogames", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Video games: the history"; |
| text = |
| "Early games used interactive electronic devices with various display formats. The earliest example is" |
| + " from 1947—a device was filed for a patent on 25 January 1947, by Thomas T. Goldsmith Jr. and Estle Ray Mann," |
| + " and issued on 14 December 1948, as U.S. Patent 2455992.[2]" |
| + "Inspired by radar display tech, it consisted of an analog device that allowed a user to control a vector-drawn" |
| + " dot on the screen to simulate a missile being fired at targets, which were drawings fixed to the screen.[3]"; |
| author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "videogames", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Batman: Arkham Knight PC Benchmarks, For What They're Worth"; |
| text = |
| "Although I didn’t spend much time playing Batman: Arkham Origins, I remember the game rather well after" |
| + " testing it on no less than 30 graphics cards and 20 CPUs. Arkham Origins appeared to take full advantage of" |
| + " Unreal Engine 3, it ran smoothly on affordable GPUs, though it’s worth remembering that Origins was developed " |
| + "for last-gen consoles.This week marked the arrival of Batman: Arkham Knight, the fourth entry in WB’s Batman:" |
| + " Arkham series and a direct sequel to 2013’s Arkham Origins 2011’s Arkham City." |
| + "Arkham Knight is also powered by Unreal Engine 3, but you can expect noticeably improved graphics, in part because" |
| + " the PlayStation 4 and Xbox One have replaced the PS3 and 360 as the lowest common denominator."; |
| author = "Rocksteady Studios"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "batman", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Face-Off: Batman: Arkham Knight, the Dark Knight returns!"; |
| text = |
| "Despite the drama surrounding the PC release leading to its subsequent withdrawal, there's a sense of success" |
| + " in the console space as PlayStation 4 owners, and indeed those on Xbox One, get a superb rendition of Batman:" |
| + " Arkham Knight. It's fair to say Rocksteady sized up each console's strengths well ahead of producing its first" |
| + " current-gen title, and it's paid off in one of the best Batman games we've seen in years."; |
| author = "Rocksteady Studios"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "batman", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Batman: Arkham Knight Having More Trouble, But This Time not in Gotham"; |
| text = |
| "As news began to break about the numerous issues affecting the PC version of Batman: Arkham Knight, players" |
| + " of the console version breathed a sigh of relief and got back to playing the game. Now players of the PlayStation" |
| + " 4 version are having problems of their own, albeit much less severe ones." |
| + "This time Batman will have a difficult time in Gotham."; |
| author = "Rocksteady Studios"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "batman", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| title = "Batman: Arkham Knight the new legend of Gotham"; |
| text = |
| "As news began to break about the numerous issues affecting the PC version of the game, players" |
| + " of the console version breathed a sigh of relief and got back to play. Now players of the PlayStation" |
| + " 4 version are having problems of their own, albeit much less severe ones."; |
| author = "Rocksteady Studios"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(categoryFieldName, "batman", ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| indexWriter.addDocument(doc); |
| |
| doc = new Document(); |
| text = "unlabeled doc"; |
| doc.add(new Field(textFieldName, text, ft)); |
| indexWriter.addDocument(doc); |
| |
| indexWriter.commit(); |
| return indexWriter.getReader(); |
| } |
| |
| protected Document getVideoGameDocument() { |
| Document doc = new Document(); |
| String title = "The new generation of PC and Console Video games"; |
| String text = |
| "Recently a lot of games have been released for the latest generations of consoles and personal computers." |
| + "One of them is Batman: Arkham Knight released recently on PS4, X-box and personal computer." |
| + "Another important video game that will be released in November is Assassin's Creed, a classic series that sees its new installement on Halloween." |
| + "Recently a lot of problems affected the Assassin's creed series but this time it should ran smoothly on affordable GPUs." |
| + "Players are waiting for the versions of their favourite video games and so do we."; |
| String author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| return doc; |
| } |
| |
| protected Document getBatmanDocument() { |
| Document doc = new Document(); |
| String title = |
| "Batman: Arkham Knight new adventures for the super hero across Gotham, the Dark Knight has returned!"; |
| String title2 = "I am a second title !"; |
| String text = |
| "This game is the electronic version of the famous super hero adventures.It involves the interaction with the open world" |
| + " of the city of Gotham. Finally the player will be able to have fun on its personal device." |
| + " The three-dimensional images of the game are stunning, because it uses the Unreal Engine 3." |
| + " The systems available are PS4, X-Box and personal computer." |
| + " Will the simulate missile that is going to be fired, success ?\" +\n" |
| + " Will this video game make the history" |
| + " Help you favourite super hero to defeat all his enemies. The Dark Knight has returned !"; |
| String author = "Rocksteady Studios"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(titleFieldName, title2, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| return doc; |
| } |
| |
| protected Document getBatmanAmbiguosDocument() { |
| Document doc = new Document(); |
| String title = |
| "Batman: Arkham Knight new adventures for the super hero across Gotham, the Dark Knight has returned! Batman will win !"; |
| String text = |
| "Early games used interactive electronic devices with various display formats. The earliest example is" |
| + " from 1947—a device was filed for a patent on 25 January 1947, by Thomas T. Goldsmith Jr. and Estle Ray Mann," |
| + " and issued on 14 December 1948, as U.S. Patent 2455992.[2]" |
| + "Inspired by radar display tech, it consisted of an analog device that allowed a user to control a vector-drawn" |
| + " dot on the screen to simulate a missile being fired at targets, which were drawings fixed to the screen.[3]"; |
| String author = "Ign"; |
| doc.add(new Field(textFieldName, text, ft)); |
| doc.add(new Field(titleFieldName, title, ft)); |
| doc.add(new Field(authorFieldName, author, ft)); |
| doc.add(new Field(booleanFieldName, "false", ft)); |
| return doc; |
| } |
| } |