blob: 4310749968484540aa95e7dc967c6d18824f32b7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
/**
 * Maps crawled data to JSON by appending text to an internal {@link StringBuilder}.
 *
 * <p>Every write method appends its element followed by a trailing comma (and,
 * for most elements, a newline). The close methods and {@link #generateJson()}
 * strip the trailing comma that precedes a closing bracket or the end of the
 * document. Nesting depth is tracked in {@code tabCount} and rendered as one
 * tab character per level.
 *
 * <p>Instances keep mutable state and do no synchronization.
 *
 * @see <a href='https://docs.oracle.com/javase/7/docs/api/java/lang/StringBuilder.html'>StringBuilder</a>
 */
public class CommonCrawlFormatSimple extends AbstractCommonCrawlFormat {

  /** Accumulates the JSON text produced so far. */
  private final StringBuilder sb;

  /** Current nesting depth; one tab is emitted per level. */
  private int tabCount;

  public CommonCrawlFormatSimple(String url, Content content, Metadata metadata, Configuration nutchConf, CommonCrawlConfig config) throws IOException {
    super(url, content, metadata, nutchConf, config);
    this.sb = new StringBuilder();
    this.tabCount = 0;
  }

  /**
   * Appends {@code "key": "value",} on its own indented line. Both key and
   * value are JSON-escaped; a null or empty value is written as {@code ""}.
   */
  @Override
  protected void writeKeyValue(String key, String value) throws IOException {
    // String.valueOf keeps the historical rendering of a null key as "null".
    sb.append(printTabs()).append(quote(String.valueOf(key))).append(": ")
        .append(quote(value)).append(",\n");
  }

  /** Appends {@code "key": null,} on its own indented line. */
  @Override
  protected void writeKeyNull(String key) throws IOException {
    sb.append(printTabs()).append(quote(String.valueOf(key))).append(": null,\n");
  }

  /**
   * Opens an array, optionally named by {@code key}.
   *
   * @param key     member name, or null for an anonymous array
   * @param nested  not used by this implementation
   * @param newline when true, a newline follows the bracket and the
   *                indentation level is increased by one
   */
  @Override
  protected void startArray(String key, boolean nested, boolean newline) throws IOException {
    sb.append(printTabs()).append(memberName(key)).append('[');
    if (newline) {
      sb.append('\n');
      this.tabCount++;
    }
  }

  /**
   * Closes the array opened by the matching {@link #startArray} call.
   *
   * @param key     not used by this implementation
   * @param nested  not used by this implementation
   * @param newline must match the value given to {@code startArray}; when
   *                true the indentation level is restored and the closing
   *                bracket is indented at the restored level
   */
  @Override
  protected void closeArray(String key, boolean nested, boolean newline) throws IOException {
    // The last element left either "," (plain value) or ",\n" (object/array).
    int last = sb.length() - 1;
    if (!deleteIfComma(last)) {
      deleteIfComma(last - 1);
    }
    if (newline) {
      // Undo the increment done in startArray *before* printing the tabs,
      // mirroring closeObject. Incrementing here (as before) made the
      // indentation drift deeper after every array.
      this.tabCount--;
      sb.append(printTabs());
    }
    sb.append("],\n");
  }

  /**
   * Appends one JSON-escaped string element followed by a comma.
   * A null value is written as the string {@code "null"}.
   */
  @Override
  protected void writeArrayValue(String value) {
    // Escape the value; raw concatenation produced invalid JSON for values
    // containing quotes, backslashes or control characters.
    sb.append(quote(String.valueOf(value))).append(',');
  }

  /**
   * Opens an object, optionally named by {@code key}, and increases the
   * indentation level by one.
   *
   * @param key member name, or null for an anonymous object
   */
  protected void startObject(String key) throws IOException {
    sb.append(printTabs()).append(memberName(key)).append("{\n");
    this.tabCount++;
  }

  /**
   * Closes the current object: removes the comma left by its last member,
   * decreases the indentation level and appends the closing brace.
   *
   * @param key not used by this implementation
   */
  protected void closeObject(String key) throws IOException {
    // Members end with ",\n", so the comma (if any) is the second-last char.
    deleteIfComma(sb.length() - 2);
    this.tabCount--;
    sb.append(printTabs()).append("},\n");
  }

  /**
   * Returns the accumulated JSON text without the trailing {@code ",\n"}
   * left by the last closed element. Safe to call on an empty buffer and
   * safe to call more than once.
   */
  protected String generateJson() throws IOException {
    int len = sb.length();
    if (len >= 2 && sb.charAt(len - 2) == ',' && sb.charAt(len - 1) == '\n') {
      sb.setLength(len - 2);
    }
    return sb.toString();
  }

  /** Returns {@code "key": } with the key escaped, or an empty string for a null key. */
  private static String memberName(String key) {
    return (key != null) ? quote(key) + ": " : "";
  }

  /**
   * Deletes the character at {@code index} if it exists and is a comma.
   *
   * @return true if a comma was deleted
   */
  private boolean deleteIfComma(int index) {
    if (index >= 0 && index < sb.length() && sb.charAt(index) == ',') {
      sb.deleteCharAt(index);
      return true;
    }
    return false;
  }

  /** Returns one tab character per current nesting level. */
  private String printTabs() {
    StringBuilder tabs = new StringBuilder();
    for (int i = 0; i < this.tabCount; i++) {
      tabs.append('\t');
    }
    return tabs.toString();
  }

  /**
   * Returns {@code string} as a double-quoted JSON string literal.
   *
   * <p>Backslash and double quote are backslash-escaped; {@code \b \t \n \f \r}
   * use their short escapes; a {@code /} directly after {@code <} is escaped
   * so the output never contains a literal {@code </}; characters below
   * U+0020, in U+0080..U+009F and in U+2000..U+20FF are written as
   * {@code \}{@code uXXXX}. A null or empty input yields {@code ""}.
   */
  private static String quote(String string) {
    if (string == null || string.length() == 0) {
      return "\"\"";
    }
    StringBuilder out = new StringBuilder(string.length() + 2);
    char previous;
    char c = 0;
    out.append('"');
    for (int i = 0; i < string.length(); i++) {
      previous = c;
      c = string.charAt(i);
      switch (c) {
        case '\\':
        case '"':
          out.append('\\');
          out.append(c);
          break;
        case '/':
          if (previous == '<') {
            out.append('\\');
          }
          out.append(c);
          break;
        case '\b':
          out.append("\\b");
          break;
        case '\t':
          out.append("\\t");
          break;
        case '\n':
          out.append("\\n");
          break;
        case '\f':
          out.append("\\f");
          break;
        case '\r':
          out.append("\\r");
          break;
        default:
          if (c < ' ' || (c >= '\u0080' && c < '\u00a0')
              || (c >= '\u2000' && c < '\u2100')) {
            String hex = Integer.toHexString(c);
            out.append("\\u");
            // Left-pad the hex code to four digits.
            out.append("0000", 0, 4 - hex.length());
            out.append(hex);
          } else {
            out.append(c);
          }
      }
    }
    out.append('"');
    return out.toString();
  }
}