| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.solr.util; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.nio.ByteBuffer; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import org.apache.solr.SolrTestCaseJ4; |
| import org.apache.solr.util.SimplePostTool.PageFetcher; |
| import org.apache.solr.util.SimplePostTool.PageFetcherResult; |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| /** |
| * NOTE: do *not* use real hostnames, not even "example.com", in this test. |
| * |
| * A MockPageFetcher is used to prevent real HTTP requests from being executed. |
| */ |
| public class SimplePostToolTest extends SolrTestCaseJ4 { |
| |
| SimplePostTool t_file, t_file_auto, t_file_rec, t_web, t_test; |
| PageFetcher pf; |
| |
| @Before |
| public void initVariousPostTools() throws Exception { |
| String[] args = {"-"}; |
| |
| // Add a dummy core/collection property so that the SimplePostTool |
| // doesn't fail fast. |
| System.setProperty("c", "testcollection"); |
| |
| System.setProperty("data", "files"); |
| t_file = SimplePostTool.parseArgsAndInit(args); |
| |
| System.setProperty("auto", "yes"); |
| t_file_auto = SimplePostTool.parseArgsAndInit(args); |
| |
| System.setProperty("recursive", "yes"); |
| t_file_rec = SimplePostTool.parseArgsAndInit(args); |
| |
| System.setProperty("data", "web"); |
| t_web = SimplePostTool.parseArgsAndInit(args); |
| |
| System.setProperty("params", "param1=foo¶m2=bar"); |
| System.setProperty("url", "http://user:password@localhost:5150/solr/update"); |
| t_test = SimplePostTool.parseArgsAndInit(args); |
| |
| pf = new MockPageFetcher(); |
| for (SimplePostTool mockable : new SimplePostTool[]{t_web, t_file_auto}) { |
| mockable.pageFetcher = pf; |
| mockable.mockMode = true; |
| } |
| } |
| |
| @Test |
| public void testParseArgsAndInit() { |
| assertEquals(false, t_file.auto); |
| assertEquals(true, t_file_auto.auto); |
| assertEquals(0, t_file_auto.recursive); |
| assertEquals(999, t_file_rec.recursive); |
| assertEquals(true, t_file.commit); |
| assertEquals(false, t_file.optimize); |
| assertEquals(null, t_file.out); |
| |
| assertEquals(1, t_web.recursive); |
| assertEquals(10, t_web.delay); |
| |
| assertEquals("http://user:password@localhost:5150/solr/update?param1=foo¶m2=bar",t_test.solrUrl.toExternalForm()); |
| } |
| |
| @Test |
| public void testNormalizeUrlEnding() { |
| assertEquals("http://[ff01::114]", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/")); |
| assertEquals("http://[ff01::114]", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz")); |
| assertEquals("http://[ff01::114]/index.html", SimplePostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello")); |
| } |
| |
| @Test |
| public void testComputeFullUrl() throws MalformedURLException { |
| assertEquals("http://[ff01::114]/index.html", t_web.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html")); |
| assertEquals("http://[ff01::114]/index.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html")); |
| assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html")); |
| // TODO: How to know what is the base if URL path ends with "foo"?? |
| // assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new URL("http://[ff01::114]/foo?baz#hello"), "fil.html")); |
| assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg")); |
| assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:hello@foo.bar")); |
| assertEquals(null, t_web.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file")); |
| } |
| |
| @Test |
| public void testTypeSupported() { |
| assertTrue(t_web.typeSupported("application/pdf")); |
| assertTrue(t_web.typeSupported("application/xml")); |
| assertFalse(t_web.typeSupported("text/foo")); |
| |
| t_web.fileTypes = "doc,xls,ppt"; |
| t_web.fileFilter = t_web.getFileFilterFromFileTypes(t_web.fileTypes); |
| assertFalse(t_web.typeSupported("application/pdf")); |
| assertTrue(t_web.typeSupported("application/msword")); |
| } |
| |
| @Test |
| public void testIsOn() { |
| assertTrue(SimplePostTool.isOn("true")); |
| assertTrue(SimplePostTool.isOn("1")); |
| assertFalse(SimplePostTool.isOn("off")); |
| } |
| |
| @Test |
| public void testAppendParam() { |
| assertEquals("http://[ff01::114]?foo=bar", SimplePostTool.appendParam("http://[ff01::114]", "foo=bar")); |
| assertEquals("http://[ff01::114]/?a=b&foo=bar", SimplePostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar")); |
| } |
| |
| @Test |
| public void testAppendUrlPath() throws MalformedURLException { |
| assertEquals(new URL("http://[ff01::114]/a?foo=bar"), SimplePostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a")); |
| } |
| |
| @Test |
| public void testGuessType() { |
| File f = new File("foo.doc"); |
| assertEquals("application/msword", SimplePostTool.guessType(f)); |
| f = new File("foobar"); |
| assertEquals("application/octet-stream", SimplePostTool.guessType(f)); |
| f = new File("foo.jsonl"); |
| assertEquals("application/json", SimplePostTool.guessType(f)); |
| } |
| |
| @Test |
| public void testDoFilesMode() { |
| t_file_auto.recursive = 0; |
| File dir = getFile("exampledocs"); |
| int num = t_file_auto.postFiles(new File[] {dir}, 0, null, null); |
| assertEquals(2, num); |
| } |
| |
| @Test |
| public void testDoWebMode() { |
| // Uses mock pageFetcher |
| t_web.delay = 0; |
| t_web.recursive = 5; |
| int num = t_web.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); |
| assertEquals(5, num); |
| |
| t_web.recursive = 1; |
| num = t_web.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null); |
| assertEquals(3, num); |
| |
| // Without respecting robots.txt |
| t_web.pageFetcher.robotsCache.put("[ff01::114]", Collections.emptyList()); |
| t_web.recursive = 5; |
| num = t_web.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); |
| assertEquals(6, num); |
| } |
| |
| @Test |
| public void testRobotsExclusion() throws MalformedURLException { |
| assertFalse(t_web.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/"))); |
| assertTrue(t_web.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed"))); |
| assertTrue("There should be two entries parsed from robots.txt", t_web.pageFetcher.robotsCache.get("[ff01::114]").size() == 2); |
| } |
| |
| static class MockPageFetcher extends PageFetcher { |
| HashMap<String,String> htmlMap = new HashMap<>(); |
| HashMap<String,Set<URL>> linkMap = new HashMap<>(); |
| |
| public MockPageFetcher() throws IOException { |
| (new SimplePostTool()).super(); |
| htmlMap.put("http://[ff01::114]", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); |
| htmlMap.put("http://[ff01::114]/index.html", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); |
| htmlMap.put("http://[ff01::114]/page1", "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>"); |
| htmlMap.put("http://[ff01::114]/page1/foo", "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>"); |
| htmlMap.put("http://[ff01::114]/page1/foo/bar", "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>"); |
| htmlMap.put("http://[ff01::114]/page2", "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>"); |
| htmlMap.put("http://[ff01::114]/disallowed", "<html><body><a href=\"http://[ff01::114]/\"></body></html>"); |
| |
| Set<URL> s = new HashSet<>(); |
| s.add(new URL("http://[ff01::114]/page1")); |
| s.add(new URL("http://[ff01::114]/page2")); |
| linkMap.put("http://[ff01::114]", s); |
| linkMap.put("http://[ff01::114]/index.html", s); |
| s = new HashSet<>(); |
| s.add(new URL("http://[ff01::114]/page1/foo")); |
| linkMap.put("http://[ff01::114]/page1", s); |
| s = new HashSet<>(); |
| s.add(new URL("http://[ff01::114]/page1/foo/bar")); |
| linkMap.put("http://[ff01::114]/page1/foo", s); |
| s = new HashSet<>(); |
| s.add(new URL("http://[ff01::114]/disallowed")); |
| linkMap.put("http://[ff01::114]/page2", s); |
| |
| // Simulate a robots.txt file with comments and a few disallows |
| StringBuilder sb = new StringBuilder(); |
| sb.append("# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n"); |
| sb.append("User-agent: * # match all bots\n"); |
| sb.append("Disallow: # This is void\n"); |
| sb.append("Disallow: /disallow # Disallow this path\n"); |
| sb.append("Disallow: /nonexistingpath # Disallow this path\n"); |
| this.robotsCache.put("[ff01::114]", super. |
| parseRobotsTxt(new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)))); |
| } |
| |
| @Override |
| public PageFetcherResult readPageFromUrl(URL u) { |
| PageFetcherResult res = new PageFetcherResult(); |
| if (isDisallowedByRobots(u)) { |
| res.httpStatus = 403; |
| return res; |
| } |
| res.httpStatus = 200; |
| res.contentType = "text/html"; |
| res.content = ByteBuffer.wrap( htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8)); |
| return res; |
| } |
| |
| @Override |
| public Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, URL postUrl) { |
| Set<URL> s = linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString())); |
| if(s == null) |
| s = new HashSet<>(); |
| return s; |
| } |
| } |
| } |