| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.tools; |
| |
| import java.io.IOException; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.metadata.Metadata; |
| import org.apache.nutch.protocol.Content; |
| |
| /** |
| * This class provides methods to map crawled data on JSON using a StringBuilder object. |
| * @see <a href='https://docs.oracle.com/javase/7/docs/api/java/lang/StringBuilder.html'>StringBuilder</a> |
| * |
| */ |
| public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat { |
| |
| private StringBuilder sb; |
| |
| private int tabCount; |
| |
| public CommonCrawlFormatSimple(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException { |
| super(url, content, metadata, nutchConf, config); |
| |
| this.sb = new StringBuilder(); |
| this.tabCount = 0; |
| } |
| |
| @Override |
| protected void writeKeyValue(String key, String value) throws IOException { |
| sb.append(printTabs() + "\"" + key + "\": " + quote(value) + ",\n"); |
| } |
| |
| @Override |
| protected void writeKeyNull(String key) throws IOException { |
| sb.append(printTabs() + "\"" + key + "\": null,\n"); |
| } |
| |
| @Override |
| protected void startArray(String key, boolean nested, boolean newline) throws IOException { |
| String name = (key != null) ? "\"" + key + "\": " : ""; |
| String nl = (newline) ? "\n" : ""; |
| sb.append(printTabs() + name + "[" + nl); |
| if (newline) { |
| this.tabCount++; |
| } |
| } |
| |
| @Override |
| protected void closeArray(String key, boolean nested, boolean newline) throws IOException { |
| if (sb.charAt(sb.length()-1) == ',') { |
| sb.deleteCharAt(sb.length()-1); // delete comma |
| } |
| else if (sb.charAt(sb.length()-2) == ',') { |
| sb.deleteCharAt(sb.length()-2); // delete comma |
| } |
| String nl = (newline) ? printTabs() : ""; |
| if (newline) { |
| this.tabCount++; |
| } |
| sb.append(nl + "],\n"); |
| } |
| |
| @Override |
| protected void writeArrayValue(String value) { |
| sb.append("\"" + value + "\","); |
| } |
| |
| protected void startObject(String key) throws IOException { |
| String name = ""; |
| if (key != null) { |
| name = "\"" + key + "\": "; |
| } |
| sb.append(printTabs() + name + "{\n"); |
| this.tabCount++; |
| } |
| |
| protected void closeObject(String key) throws IOException { |
| if (sb.charAt(sb.length()-2) == ',') { |
| sb.deleteCharAt(sb.length()-2); // delete comma |
| } |
| this.tabCount--; |
| sb.append(printTabs() + "},\n"); |
| } |
| |
| protected String generateJson() throws IOException { |
| sb.deleteCharAt(sb.length()-1); // delete new line |
| sb.deleteCharAt(sb.length()-1); // delete comma |
| return sb.toString(); |
| } |
| |
| private String printTabs() { |
| StringBuilder sb = new StringBuilder(); |
| for (int i=0; i < this.tabCount ;i++) { |
| sb.append("\t"); |
| } |
| return sb.toString(); |
| } |
| |
| private static String quote(String string) throws IOException { |
| StringBuilder sb = new StringBuilder(); |
| |
| if (string == null || string.length() == 0) { |
| sb.append("\"\""); |
| return sb.toString(); |
| } |
| |
| char b; |
| char c = 0; |
| String hhhh; |
| int i; |
| int len = string.length(); |
| |
| sb.append('"'); |
| for (i = 0; i < len; i += 1) { |
| b = c; |
| c = string.charAt(i); |
| switch (c) { |
| case '\\': |
| case '"': |
| sb.append('\\'); |
| sb.append(c); |
| break; |
| case '/': |
| if (b == '<') { |
| sb.append('\\'); |
| } |
| sb.append(c); |
| break; |
| case '\b': |
| sb.append("\\b"); |
| break; |
| case '\t': |
| sb.append("\\t"); |
| break; |
| case '\n': |
| sb.append("\\n"); |
| break; |
| case '\f': |
| sb.append("\\f"); |
| break; |
| case '\r': |
| sb.append("\\r"); |
| break; |
| default: |
| if (c < ' ' || (c >= '\u0080' && c < '\u00a0') |
| || (c >= '\u2000' && c < '\u2100')) { |
| sb.append("\\u"); |
| hhhh = Integer.toHexString(c); |
| sb.append("0000", 0, 4 - hhhh.length()); |
| sb.append(hhhh); |
| } else { |
| sb.append(c); |
| } |
| } |
| } |
| sb.append('"'); |
| return sb.toString(); |
| } |
| } |