fix metadata concurrency problem: make canonicalization caches thread-safe
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/Canonicalizer.java b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/Canonicalizer.java
new file mode 100644
index 0000000..ece6e63
--- /dev/null
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/Canonicalizer.java
@@ -0,0 +1,59 @@
+/**
+ * Copyright 2014 Twitter, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package parquet.hadoop.metadata;
+
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * returns canonical representation of objects (similar to String.intern()) to save memory
+ * if a.equals(b) then canonicalize(a) == canonicalize(b)
+ * this class is thread safe
+ * @author Julien Le Dem
+ *
+ * @param <T>
+ */
+public class Canonicalizer<T> {
+
+ private ConcurrentHashMap<T, T> canonicals = new ConcurrentHashMap<T, T>();
+
+ /**
+ * @param value the value to canonicalize
+ * @return the corresponding canonical value
+ */
+ final public T canonicalize(T value) {
+ T canonical = canonicals.get(value);
+ if (canonical == null) {
+ value = toCanonical(value);
+ T existing = canonicals.putIfAbsent(value, value);
+ // putIfAbsent is atomic, making sure we always return the same canonical representation of the value
+ if (existing == null) {
+ canonical = value;
+ } else {
+ canonical = existing;
+ }
+ }
+ return canonical;
+ }
+
+ /**
+ * @param value the value to canonicalize if needed
+ * @return the canonicalized value
+ */
+ protected T toCanonical(T value) {
+ return value;
+ }
+}
+
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnChunkProperties.java b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnChunkProperties.java
index 3b74cec..bf6249f 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnChunkProperties.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnChunkProperties.java
@@ -16,8 +16,6 @@
package parquet.hadoop.metadata;
import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
import java.util.Set;
import parquet.column.Encoding;
@@ -25,16 +23,10 @@
public class ColumnChunkProperties {
- private static Map<ColumnChunkProperties, ColumnChunkProperties> cache = new HashMap<ColumnChunkProperties, ColumnChunkProperties>();
+ private static Canonicalizer<ColumnChunkProperties> properties = new Canonicalizer<ColumnChunkProperties>();
public static ColumnChunkProperties get(ColumnPath path, PrimitiveTypeName type, CompressionCodecName codec, Set<Encoding> encodings) {
- ColumnChunkProperties key = new ColumnChunkProperties(codec, path, type, encodings);
- ColumnChunkProperties cached = cache.get(key);
- if (cached == null) {
- cached = key;
- cache.put(key, cached);
- }
- return cached;
+ return properties.canonicalize(new ColumnChunkProperties(codec, path, type, encodings));
}
private final CompressionCodecName codec;
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnPath.java b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnPath.java
index e454eef..b179ae3 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnPath.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/ColumnPath.java
@@ -16,25 +16,22 @@
package parquet.hadoop.metadata;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.Iterator;
-import java.util.Map;
public final class ColumnPath implements Iterable<String> {
- private static Map<ColumnPath, ColumnPath> paths = new HashMap<ColumnPath, ColumnPath>();
+ private static Canonicalizer<ColumnPath> paths = new Canonicalizer<ColumnPath>() {
+ protected ColumnPath toCanonical(ColumnPath value) {
+ String[] path = new String[value.p.length];
+ for (int i = 0; i < value.p.length; i++) {
+ path[i] = value.p[i].intern();
+ }
+ return new ColumnPath(path);
+ }
+ };
public static ColumnPath get(String... path){
- ColumnPath key = new ColumnPath(path);
- ColumnPath cached = paths.get(key);
- if (cached == null) {
- for (int i = 0; i < path.length; i++) {
- path[i] = path[i].intern();
- }
- cached = key;
- paths.put(key, cached);
- }
- return cached;
+ return paths.canonicalize(new ColumnPath(path));
}
private final String[] p;
diff --git a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/EncodingList.java b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/EncodingList.java
index 9d375c5..790b601 100644
--- a/parquet-hadoop/src/main/java/parquet/hadoop/metadata/EncodingList.java
+++ b/parquet-hadoop/src/main/java/parquet/hadoop/metadata/EncodingList.java
@@ -16,25 +16,17 @@
package parquet.hadoop.metadata;
import java.util.Collections;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
-import java.util.Map;
import parquet.column.Encoding;
public class EncodingList implements Iterable<Encoding> {
- private static Map<EncodingList, EncodingList> encodingLists = new HashMap<EncodingList, EncodingList>();
+ private static Canonicalizer<EncodingList> encodingLists = new Canonicalizer<EncodingList>();
public static EncodingList getEncodingList(List<Encoding> encodings) {
- EncodingList key = new EncodingList(encodings);
- EncodingList cached = encodingLists.get(key);
- if (cached == null) {
- cached = key;
- encodingLists.put(key, cached);
- }
- return cached;
+ return encodingLists.canonicalize(new EncodingList(encodings));
}
private final List<Encoding> encodings;