| Index: lucene/benchmark/build.xml
|
| ===================================================================
|
| --- lucene/benchmark/build.xml (revision 1417689)
|
| +++ lucene/benchmark/build.xml (working copy)
|
| @@ -152,7 +152,7 @@
|
| <fileset dir="lib"> |
| <include name="commons-compress-1.4.1.jar"/> |
| <include name="xercesImpl-2.9.1.jar"/> |
| - <include name="nekohtml-1.9.15.jar"/> |
| + <include name="nekohtml-1.9.17.jar"/> |
| </fileset> |
| </path> |
| <path id="run.classpath"> |
| Index: lucene/benchmark/ivy.xml
|
| ===================================================================
|
| --- lucene/benchmark/ivy.xml (revision 1417689)
|
| +++ lucene/benchmark/ivy.xml (working copy)
|
| @@ -21,7 +21,7 @@
|
| <dependencies> |
| <dependency org="org.apache.commons" name="commons-compress" rev="1.4.1" transitive="false"/> |
| <dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/> |
| - <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/> |
| + <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.17" transitive="false"/> |
| <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> |
| </dependencies> |
| </ivy-module> |
| Index: lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
|
| ===================================================================
|
| --- lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (revision 1417689)
|
| +++ lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (working copy)
|
| @@ -20,6 +20,7 @@
|
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| +import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashSet; |
| @@ -65,10 +66,10 @@
|
| @Override |
| public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { |
| if (inHEAD > 0) { |
| - if (equalsIgnoreTurkish("title", localName)) { |
| + if ("title".equals(localName)) { |
| inTITLE++; |
| } else { |
| - if (equalsIgnoreTurkish("meta", localName)) { |
| + if ("meta".equals(localName)) { |
| String name = atts.getValue("name"); |
| if (name == null) { |
| name = atts.getValue("http-equiv"); |
| @@ -82,7 +83,7 @@
|
| } else if (inBODY > 0) { |
| if (SUPPRESS_ELEMENTS.contains(localName)) { |
| suppressed++; |
| - } else if (equalsIgnoreTurkish("img", localName)) { |
| + } else if ("img".equals(localName)) { |
| // the original javacc-based parser preserved <IMG alt="..."/> |
| // attribute as body text in [] parenthesis: |
| final String alt = atts.getValue("alt"); |
| @@ -90,11 +91,11 @@
|
| body.append('[').append(alt).append(']'); |
| } |
| } |
| - } else if (equalsIgnoreTurkish("body", localName)) { |
| + } else if ("body".equals(localName)) { |
| inBODY++; |
| - } else if (equalsIgnoreTurkish("head", localName)) { |
| + } else if ("head".equals(localName)) { |
| inHEAD++; |
| - } else if (equalsIgnoreTurkish("frameset", localName)) { |
| + } else if ("frameset".equals(localName)) { |
| throw new SAXException("This parser does not support HTML framesets."); |
| } |
| } |
| @@ -102,7 +103,7 @@
|
| @Override |
| public void endElement(String namespaceURI, String localName, String qName) throws SAXException { |
| if (inBODY > 0) { |
| - if (equalsIgnoreTurkish("body", localName)) { |
| + if ("body".equals(localName)) { |
| inBODY--; |
| } else if (ENDLINE_ELEMENTS.contains(localName)) { |
| body.append('\n'); |
| @@ -110,9 +111,9 @@
|
| suppressed--; |
| } |
| } else if (inHEAD > 0) { |
| - if (equalsIgnoreTurkish("head", localName)) { |
| + if ("head".equals(localName)) { |
| inHEAD--; |
| - } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) { |
| + } else if (inTITLE > 0 && "title".equals(localName)) { |
| inTITLE--; |
| } |
| } |
| @@ -145,38 +146,10 @@
|
| this.body = body.toString(); |
| } |
| |
| - // TODO: remove the Turkish workaround once this is fixed in NekoHTML: |
| - // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178 |
| - |
| - // BEGIN: workaround |
| - static final String convertTurkish(String s) { |
| - return s.replace('i', 'ı'); |
| + private static final Set<String> createElementNameSet(String... names) { |
| + return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(names))); |
| } |
| |
| - static final boolean equalsIgnoreTurkish(String s1, String s2) { |
| - final int len1 = s1.length(), len2 = s2.length(); |
| - if (len1 != len2) |
| - return false; |
| - for (int i = 0; i < len1; i++) { |
| - char ch1 = s1.charAt(i), ch2 = s2.charAt(i); |
| - if (ch1 == 'ı') ch1 = 'i'; |
| - if (ch2 == 'ı') ch2 = 'i'; |
| - if (ch1 != ch2) |
| - return false; |
| - } |
| - return true; |
| - } |
| - // END: workaround |
| - |
| - static final Set<String> createElementNameSet(String... names) { |
| - final HashSet<String> set = new HashSet<String>(); |
| - for (final String name : names) { |
| - set.add(name); |
| - set.add(convertTurkish(name)); |
| - } |
| - return Collections.unmodifiableSet(set); |
| - } |
| - |
| /** HTML elements that cause a line break (they are block-elements) */ |
| static final Set<String> ENDLINE_ELEMENTS = createElementNameSet( |
| "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", |
| Index: lucene/licenses/nekohtml-1.9.15.jar.sha1
|
| ===================================================================
|
| --- lucene/licenses/nekohtml-1.9.15.jar.sha1 (revision 1417689)
|
| +++ lucene/licenses/nekohtml-1.9.15.jar.sha1 (working copy)
|
| @@ -1 +0,0 @@
|
| -a45cd7b7401d9c2264d4908182380452c03ebf8f |
| Index: lucene/licenses/nekohtml-1.9.17.jar.sha1
|
| ===================================================================
|
| --- lucene/licenses/nekohtml-1.9.17.jar.sha1 (revision 0)
|
| +++ lucene/licenses/nekohtml-1.9.17.jar.sha1 (working copy)
|
| @@ -0,0 +1 @@
|
| +39a870b0ea4cb0d2a3015c1ab569d17d83122d55 |
| Index: lucene/licenses/nekohtml-1.9.17.jar.sha1
|
| ===================================================================
|
| --- lucene/licenses/nekohtml-1.9.17.jar.sha1 (revision 0)
|
| +++ lucene/licenses/nekohtml-1.9.17.jar.sha1 (working copy)
|
|
|
| Property changes on: lucene/licenses/nekohtml-1.9.17.jar.sha1
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| ## -0,0 +1 ##
|
| +native
|
| \ No newline at end of property
|