SQOOP-3074: Fix Avro import so that it does not fail with javac
errors when the host locale is not UTF-8
(Attila Szabo)
diff --git a/src/java/org/apache/sqoop/avro/AvroUtil.java b/src/java/org/apache/sqoop/avro/AvroUtil.java
index ee29f14..8d90130 100644
--- a/src/java/org/apache/sqoop/avro/AvroUtil.java
+++ b/src/java/org/apache/sqoop/avro/AvroUtil.java
@@ -28,6 +28,7 @@
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
+import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -106,7 +107,10 @@
* Convert Column name into Avro column name.
*/
public static String toAvroColumn(String column) {
- String candidate = ClassWriter.toJavaIdentifier(column);
+ // We're unescaping identifiers to get the real Unicode characters
+ // back, and not the escaped versions.
+ String candidate = StringEscapeUtils.unescapeJava(
+ ClassWriter.toJavaIdentifier(column));
return toAvroIdentifier(candidate);
}
diff --git a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
index 3c31c43..5b1c745 100644
--- a/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
+++ b/src/java/org/apache/sqoop/orm/AvroSchemaGenerator.java
@@ -29,6 +29,7 @@
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
+import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -88,7 +89,9 @@
List<Field> fields = new ArrayList<Field>();
for (String columnName : columnNames) {
- String cleanedCol = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(columnName));
+ // We're unescaping identifiers to get the real Unicode characters
+ // back, and not the escaped versions.
+ String cleanedCol = AvroUtil.toAvroIdentifier(StringEscapeUtils.unescapeJava(ClassWriter.toJavaIdentifier(columnName)));
List<Integer> columnInfoList = columnInfo.get(columnName);
int sqlType = columnInfoList.get(0);
Integer precision = columnInfoList.get(1);
diff --git a/src/java/org/apache/sqoop/orm/ClassWriter.java b/src/java/org/apache/sqoop/orm/ClassWriter.java
index 6f6e66b..0c8d86d 100644
--- a/src/java/org/apache/sqoop/orm/ClassWriter.java
+++ b/src/java/org/apache/sqoop/orm/ClassWriter.java
@@ -284,7 +284,16 @@
return "_" + output;
}
- return output;
+ // Calling StringEscapeUtils#escapeJava is required because we'd like to
+ // support Unicode characters in identifiers even if the locale of the host
+ // system does not support UTF-8, or is for any reason different from it.
+ // For example, if a column name contains the character U+C3A1 but the
+ // locale does not support Unicode, the generated Java file would contain
+ // characters the compiler cannot recognize, and javac would fail with a
+ // compile error. With escaping, a column named Alm<U+C3A1>a is written as
+ // the ASCII-only text Alm\uC3A1a, and that escaped form is used everywhere
+ // the identifier is used.
+ return StringEscapeUtils.escapeJava(output);
}
private String toJavaType(String columnName, int sqlType) {