hadoop-tools/hadoop-rumen/src/main/java/org/apache/hadoop/tools/rumen/datatypes/FileName.java - hadoop - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.hadoop.tools.rumen.datatypes;

 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.tools.rumen.anonymization.WordList;
 import org.apache.hadoop.tools.rumen.anonymization.WordListAnonymizerUtility;
 import org.apache.hadoop.tools.rumen.state.State;
 import org.apache.hadoop.tools.rumen.state.StatePool;
 import org.apache.hadoop.util.StringUtils;

 /**
  * Represents a file's location and knows how to produce an anonymized
  * version of it.
  *
  * Currently, only filenames that can be represented using {@link Path} are
  * supported.
  */
 public class FileName implements AnonymizableDataType<String> {
   // the original (un-anonymized) value, exactly as handed to the constructor
   private final String fileName;
   // computed on demand by getAnonymizedValue(); stays null until then
   private String anonymizedFileName;
   private static final String PREV_DIR = "..";
   // suffixes that are split off before anonymization and re-attached after it
   private static final String[] KNOWN_SUFFIXES =
     new String[] {".xml", ".jar", ".txt", ".tar", ".zip", ".json", ".gzip",
                   ".lzo"};

   /**
    * A composite state for filename: one word list for directory names and
    * one word list for file names.
    */
   public static class FileNameState implements State {
     private WordList dirState = new WordList("dir");
     private WordList fileNameState =  new WordList("file");

     /**
      * @return true if either the directory or the file-name word list reports
      *         an update
      */
     @Override
     public boolean isUpdated() {
       return dirState.isUpdated() || fileNameState.isUpdated();
     }

     /** @return the word list used for directory components */
     public WordList getDirectoryState() {
       return dirState;
     }

     /** @return the word list used for the file (last) component */
     public WordList getFileNameState() {
       return fileNameState;
     }

     /** Replaces the word list used for directory components. */
     public void setDirectoryState(WordList state) {
       this.dirState = state;
     }

     /** Replaces the word list used for the file (last) component. */
     public void setFileNameState(WordList state) {
       this.fileNameState = state;
     }

     /** @return the hardcoded name of this state, "path" */
     @Override
     public String getName() {
       return "path";
     }

     /**
      * Accepts only the hardcoded name returned by {@link #getName()}.
      *
      * @throws RuntimeException if {@code name} differs from {@link #getName()}
      */
     @Override
     public void setName(String name) {
       // for now, simply assert since this class has a hardcoded name
       if (!getName().equals(name)) {
         throw new RuntimeException("State name mismatch! Expected '"
                                    + getName() + "' but found '" + name + "'.");
       }
     }
   }

   /**
    * @param fileName the file location(s) to wrap; stored as-is
    */
   public FileName(String fileName) {
     this.fileName = fileName;
   }

   /** @return the original value passed to the constructor */
   @Override
   public String getValue() {
     return fileName;
   }

   /**
    * Returns the anonymized form of the wrapped value, computing it on the
    * first call and reusing the stored result afterwards.
    *
    * @param statePool pool holding the {@link FileNameState} shared between
    *                  {@link FileName} instances
    * @param conf      configuration handed through to host-name anonymization
    * @return the anonymized value
    * @throws RuntimeException if an entry cannot be parsed as a {@link URI}
    */
   @Override
   public String getAnonymizedValue(StatePool statePool,
                                    Configuration conf) {
     if (anonymizedFileName == null) {
       anonymize(statePool, conf);
     }
     return anonymizedFileName;
   }

   /**
    * Anonymizes every entry obtained from {@link StringUtils#split(String)} on
    * the wrapped value and stores the re-joined result in
    * {@link #anonymizedFileName}. The {@link FileNameState} is fetched from
    * the pool, or created and registered there if it is missing.
    */
   private void anonymize(StatePool statePool, Configuration conf) {
     FileNameState fState = (FileNameState) statePool.getState(getClass());
     if (fState == null) {
       fState = new FileNameState();
       statePool.addState(getClass(), fState);
     }

     String[] files = StringUtils.split(fileName);
     String[] anonymizedFileNames = new String[files.length];
     for (int i = 0; i < files.length; i++) {
       anonymizedFileNames[i] = anonymize(statePool, conf, fState, files[i]);
     }

     anonymizedFileName = StringUtils.arrayToString(anonymizedFileNames);
   }

   /**
    * Anonymizes a single location. The string is parsed as a {@link URI}; only
    * the scheme, the host and the path are carried into the result (the scheme
    * is kept verbatim, host and path are anonymized).
    *
    * @return the anonymized location
    * @throws RuntimeException wrapping the {@link URISyntaxException} if
    *                          {@code fileName} is not a valid URI
    */
   private static String anonymize(StatePool statePool, Configuration conf,
                                   FileNameState fState, String fileName) {
     final URI uri;
     try {
       uri = new URI(fileName);
     } catch (URISyntaxException use) {
       throw new RuntimeException (use);
     }

     WordList dirWords = fState.getDirectoryState();
     WordList fileWords = fState.getFileNameState();
     String scheme = uri.getScheme();

     String path = uri.getPath();
     if (path == null) {
       // Opaque URI (e.g. "file:data/part.txt"): java.net.URI leaves the path
       // undefined, and handing null to StringTokenizer would fail with a
       // NullPointerException. Anonymize the scheme-specific part instead and
       // keep the opaque "scheme:rest" shape.
       return scheme + ":"
              + anonymizePath(uri.getSchemeSpecificPart(), dirWords, fileWords);
     }

     // anonymize the path i.e without the authority & scheme
     String ret = anonymizePath(path, dirWords, fileWords);

     // anonymize the authority and scheme
     if (scheme != null) {
       String anonymizedAuthority = "";
       if (uri.getAuthority() != null) {
         // anonymize the authority
         // NOTE(review): uri.getHost() may be null even though the authority
         // is not (registry-based authority) - confirm NodeName copes with a
         // null host name.
         NodeName hostName = new NodeName(null, uri.getHost());
         anonymizedAuthority = hostName.getAnonymizedValue(statePool, conf);
       }
       ret = scheme + "://" + anonymizedAuthority + ret;
     }

     return ret;
   }

   /**
    * Anonymizes a file-path component by component. Separators and the
    * {@link Path#CUR_DIR} / ".." components are copied unchanged. A component
    * that is followed by further tokens is treated as a directory, and the
    * last token as a file.
    *
    * @param path   the path to anonymize; must not be null
    * @param dState word list used for directory components
    * @param fState word list used for the file component
    */
   private static String anonymizePath(String path, WordList dState,
                                       WordList fState) {
     StringBuilder buffer = new StringBuilder();
     StringTokenizer tokenizer = new StringTokenizer(path, Path.SEPARATOR, true);
     while (tokenizer.hasMoreTokens()) {
       String token = tokenizer.nextToken();
       boolean keepAsIs = Path.SEPARATOR.equals(token)
                          || Path.CUR_DIR.equals(token)
                          || PREV_DIR.equals(token);
       if (keepAsIs) {
         buffer.append(token);
       } else {
         // anything still to come means this component is a directory,
         // otherwise it is the file
         WordList words = tokenizer.hasMoreTokens() ? dState : fState;
         buffer.append(anonymize(token, words));
       }
     }

     return buffer.toString();
   }

   /**
    * Anonymizes one path component. A known suffix is split off first and
    * re-attached to the result. Data that is not known content is added to
    * the word list if missing and replaced by the word list's name followed
    * by the word's index in that list.
    *
    * @return null for null input; the input itself if it needs no
    *         anonymization; otherwise the anonymized component
    */
   //TODO There is no caching for saving memory.
   private static String anonymize(String data, WordList wordList) {
     if (data == null) {
       return null;
     }
     if (!WordListAnonymizerUtility.needsAnonymization(data)) {
       return data;
     }

     String suffix = "";
     String coreData = data;
     // check and extract suffix
     if (WordListAnonymizerUtility.hasSuffix(data, KNOWN_SUFFIXES)) {
       // the data ends with a known suffix
       String[] split =
         WordListAnonymizerUtility.extractSuffix(data, KNOWN_SUFFIXES);
       coreData = split[0];
       suffix = split[1];
     }

     // check if the data is known content
     //TODO [Chunking] Do this for sub-strings of data
     if (WordListAnonymizerUtility.isKnownData(coreData)) {
       return coreData + suffix;
     }

     if (!wordList.contains(coreData)) {
       wordList.add(coreData);
     }
     return wordList.getName() + wordList.indexOf(coreData) + suffix;
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.hadoop.tools.rumen.datatypes;

	import java.net.URI;
	import java.net.URISyntaxException;
	import java.util.StringTokenizer;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.tools.rumen.anonymization.WordList;
	import org.apache.hadoop.tools.rumen.anonymization.WordListAnonymizerUtility;
	import org.apache.hadoop.tools.rumen.state.State;
	import org.apache.hadoop.tools.rumen.state.StatePool;
	import org.apache.hadoop.util.StringUtils;

	/**
	* Represents a file's location.
	*
	* Currently, only filenames that can be represented using {@link Path} are
	* supported.
	*/
	public class FileName implements AnonymizableDataType<String> {
	private final String fileName;
	private String anonymizedFileName;
	private static final String PREV_DIR = "..";
	private static final String[] KNOWN_SUFFIXES =
	new String[] {".xml", ".jar", ".txt", ".tar", ".zip", ".json", ".gzip",
	".lzo"};

	/**
	* A composite state for filename.
	*/
	public static class FileNameState implements State {
	private WordList dirState = new WordList("dir");
	private WordList fileNameState = new WordList("file");

	@Override
	public boolean isUpdated() {
	return dirState.isUpdated() \|\| fileNameState.isUpdated();
	}

	public WordList getDirectoryState() {
	return dirState;
	}

	public WordList getFileNameState() {
	return fileNameState;
	}

	public void setDirectoryState(WordList state) {
	this.dirState = state;
	}

	public void setFileNameState(WordList state) {
	this.fileNameState = state;
	}

	@Override
	public String getName() {
	return "path";
	}

	@Override
	public void setName(String name) {
	// for now, simply assert since this class has a hardcoded name
	if (!getName().equals(name)) {
	throw new RuntimeException("State name mismatch! Expected '"
	+ getName() + "' but found '" + name + "'.");
	}
	}
	}

	public FileName(String fileName) {
	this.fileName = fileName;
	}

	@Override
	public String getValue() {
	return fileName;
	}

	@Override
	public String getAnonymizedValue(StatePool statePool,
	Configuration conf) {
	if (anonymizedFileName == null) {
	anonymize(statePool, conf);
	}
	return anonymizedFileName;
	}

	private void anonymize(StatePool statePool, Configuration conf) {
	FileNameState fState = (FileNameState) statePool.getState(getClass());
	if (fState == null) {
	fState = new FileNameState();
	statePool.addState(getClass(), fState);
	}

	String[] files = StringUtils.split(fileName);
	String[] anonymizedFileNames = new String[files.length];
	int i = 0;
	for (String f : files) {
	anonymizedFileNames[i++] =
	anonymize(statePool, conf, fState, f);
	}

	anonymizedFileName = StringUtils.arrayToString(anonymizedFileNames);
	}

	private static String anonymize(StatePool statePool, Configuration conf,
	FileNameState fState, String fileName) {
	String ret = null;
	try {
	URI uri = new URI(fileName);

	// anonymize the path i.e without the authority & scheme
	ret =
	anonymizePath(uri.getPath(), fState.getDirectoryState(),
	fState.getFileNameState());

	// anonymize the authority and scheme
	String authority = uri.getAuthority();
	String scheme = uri.getScheme();
	if (scheme != null) {
	String anonymizedAuthority = "";
	if (authority != null) {
	// anonymize the authority
	NodeName hostName = new NodeName(null, uri.getHost());
	anonymizedAuthority = hostName.getAnonymizedValue(statePool, conf);
	}
	ret = scheme + "://" + anonymizedAuthority + ret;
	}
	} catch (URISyntaxException use) {
	throw new RuntimeException (use);
	}

	return ret;
	}

	// Anonymize the file-path
	private static String anonymizePath(String path, WordList dState,
	WordList fState) {
	StringBuilder buffer = new StringBuilder();
	StringTokenizer tokenizer = new StringTokenizer(path, Path.SEPARATOR, true);
	while (tokenizer.hasMoreTokens()) {
	String token = tokenizer.nextToken();
	if (Path.SEPARATOR.equals(token)) {
	buffer.append(token);
	} else if (Path.CUR_DIR.equals(token)) {
	buffer.append(token);
	} else if (PREV_DIR.equals(token)) {
	buffer.append(token);
	} else if (tokenizer.hasMoreTokens()){
	// this component is a directory
	buffer.append(anonymize(token, dState));
	} else {
	// this component is a file
	buffer.append(anonymize(token, fState));
	}
	}

	return buffer.toString();
	}

	//TODO There is no caching for saving memory.
	private static String anonymize(String data, WordList wordList) {
	if (data == null) {
	return null;
	}

	if (WordListAnonymizerUtility.needsAnonymization(data)) {
	String suffix = "";
	String coreData = data;
	// check and extract suffix
	if (WordListAnonymizerUtility.hasSuffix(data, KNOWN_SUFFIXES)) {
	// check if the data ends with a known suffix
	String[] split =
	WordListAnonymizerUtility.extractSuffix(data, KNOWN_SUFFIXES);
	suffix = split[1];
	coreData = split[0];
	}

	// check if the data is known content
	//TODO [Chunking] Do this for sub-strings of data
	String anonymizedData = coreData;
	if (!WordListAnonymizerUtility.isKnownData(coreData)) {
	if (!wordList.contains(coreData)) {
	wordList.add(coreData);
	}
	anonymizedData = wordList.getName() + wordList.indexOf(coreData);
	}

	return anonymizedData + suffix;
	} else {
	return data;
	}
	}
	}