| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.cassandra.db; |
| |
| import java.io.File; |
| import java.io.IOError; |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.SortedSet; |
| import java.util.TreeSet; |
| import java.util.concurrent.ConcurrentHashMap; |
| import java.util.concurrent.ExecutionException; |
| import java.util.concurrent.Future; |
| import java.util.concurrent.locks.ReentrantReadWriteLock; |
| |
| import org.apache.cassandra.config.*; |
| import org.apache.cassandra.db.commitlog.CommitLog; |
| import org.apache.cassandra.db.filter.QueryFilter; |
| import org.apache.cassandra.db.filter.QueryPath; |
| import org.apache.cassandra.io.sstable.SSTableDeletingTask; |
| import org.apache.cassandra.io.sstable.SSTableReader; |
| import org.apache.cassandra.io.util.FileUtils; |
| import org.apache.cassandra.io.util.MmappedSegmentedFile; |
| import org.apache.cassandra.locator.AbstractReplicationStrategy; |
| import org.apache.cassandra.service.StorageService; |
| import org.apache.cassandra.utils.ByteBufferUtil; |
| import org.apache.cassandra.utils.NodeId; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import com.google.common.base.Function; |
| import com.google.common.collect.Iterables; |
| |
| /** |
| * Represents a keyspace: a named container of column families. (Despite the |
| * class name, a Table here is a keyspace, not a single column family.) |
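| * |
| * <p>Typical entry point (a sketch; the column family name is illustrative): |
| * <pre>{@code |
| * Table systemTable = Table.open(Table.SYSTEM_TABLE); |
| * ColumnFamilyStore cfs = systemTable.getColumnFamilyStore("LocationInfo"); |
| * }</pre> |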
| */ |
| public class Table |
| { |
| public static final String SYSTEM_TABLE = "system"; |
| |
| public static final String SNAPSHOT_SUBDIR_NAME = "snapshots"; |
| |
| private static final Logger logger = LoggerFactory.getLogger(Table.class); |
| |
| /** |
| * Accesses to CFS.memtable should acquire this for thread safety. |
| * Table.maybeSwitchMemtable should acquire the writeLock; see that method for the full explanation. |
| * |
| * (Enabling fairness in the RRWL is observed to decrease throughput, so we leave it off.) |
| */ |
| static final ReentrantReadWriteLock switchLock = new ReentrantReadWriteLock(); |
| |
| // It is possible to call Table.open without a running daemon, so it makes sense to ensure |
| // proper directories here as well as in CassandraDaemon. |
| static |
| { |
| if (!StorageService.instance.isClientMode()) |
| { |
| try |
| { |
| DatabaseDescriptor.createAllDirectories(); |
| } |
| catch (IOException ex) |
| { |
| throw new IOError(ex); |
| } |
| } |
| } |
| |
| /* Keyspace name. */ |
| public final String name; |
| /* ColumnFamilyStore per column family */ |
| private final Map<Integer, ColumnFamilyStore> columnFamilyStores = new ConcurrentHashMap<Integer, ColumnFamilyStore>(); |
| private final Object[] indexLocks; |
| private volatile AbstractReplicationStrategy replicationStrategy; |
| |
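| /** |
| * Returns the Table instance for the given keyspace name, creating it and |
| * registering it with the Schema on first access. |
| */ |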
| public static Table open(String table) |
| { |
| return open(table, Schema.instance); |
| } |
| |
| public static Table open(String table, Schema schema) |
| { |
| Table tableInstance = schema.getTableInstance(table); |
| |
| if (tableInstance == null) |
| { |
| // instantiate the Table. We could use putIfAbsent, but it's important to |
| // ensure this happens only once per keyspace, so we synchronize and re-check before doing it. |
| synchronized (Table.class) |
| { |
| tableInstance = schema.getTableInstance(table); |
| if (tableInstance == null) |
| { |
| // open and store the table |
| tableInstance = new Table(table); |
| schema.storeTableInstance(tableInstance); |
| |
| // the table has to be constructed and stored in the schema before cacheRow can be called |
| for (ColumnFamilyStore cfs : tableInstance.getColumnFamilyStores()) |
| cfs.initCaches(); |
| } |
| } |
| } |
| return tableInstance; |
| } |
| |
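| /** |
| * Removes the Table instance from the schema after flushing and invalidating |
| * each of its column family stores; returns the removed instance, or null if |
| * none was open. |
| */ |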
| public static Table clear(String table) throws IOException |
| { |
| return clear(table, Schema.instance); |
| } |
| |
| public static Table clear(String table, Schema schema) throws IOException |
| { |
| synchronized (Table.class) |
| { |
| Table t = schema.removeTableInstance(table); |
| if (t != null) |
| { |
| for (ColumnFamilyStore cfs : t.getColumnFamilyStores()) |
| t.unloadCf(cfs); |
| } |
| return t; |
| } |
| } |
| |
| public Collection<ColumnFamilyStore> getColumnFamilyStores() |
| { |
| return Collections.unmodifiableCollection(columnFamilyStores.values()); |
| } |
| |
| public ColumnFamilyStore getColumnFamilyStore(String cfName) |
| { |
| Integer id = Schema.instance.getId(name, cfName); |
| if (id == null) |
| throw new IllegalArgumentException(String.format("Unknown table/cf pair (%s.%s)", name, cfName)); |
| return getColumnFamilyStore(id); |
| } |
| |
| public ColumnFamilyStore getColumnFamilyStore(Integer id) |
| { |
| ColumnFamilyStore cfs = columnFamilyStores.get(id); |
| if (cfs == null) |
| throw new IllegalArgumentException("Unknown CF " + id); |
| return cfs; |
| } |
| |
| /** |
| * Do a cleanup of keys that do not belong locally. |
| */ |
| public void forceCleanup(NodeId.OneShotRenewer renewer) throws IOException, ExecutionException, InterruptedException |
| { |
| if (name.equals(SYSTEM_TABLE)) |
| throw new UnsupportedOperationException("Cleanup of the system table is neither necessary nor wise"); |
| |
| // Sort the column families in order of SSTable size, so cleanup of smaller CFs |
| // can free up space for larger ones |
| List<ColumnFamilyStore> sortedColumnFamilies = new ArrayList<ColumnFamilyStore>(columnFamilyStores.values()); |
| Collections.sort(sortedColumnFamilies, new Comparator<ColumnFamilyStore>() |
| { |
| // Compare first on size and, if equal, sort by name (arbitrary & deterministic). |
| public int compare(ColumnFamilyStore cf1, ColumnFamilyStore cf2) |
| { |
| long diff = (cf1.getTotalDiskSpaceUsed() - cf2.getTotalDiskSpaceUsed()); |
| if (diff > 0) |
| return 1; |
| if (diff < 0) |
| return -1; |
| return cf1.columnFamily.compareTo(cf2.columnFamily); |
| } |
| }); |
| |
| // Cleanup in sorted order to free up space for the larger ones |
| for (ColumnFamilyStore cfs : sortedColumnFamilies) |
| cfs.forceCleanup(renewer); |
| } |
| |
| /** |
| * Take a snapshot of the entire set of column families with a given timestamp |
| * |
| * @param snapshotName the tag associated with the name of the snapshot; must not be null |
| */ |
| public void snapshot(String snapshotName) |
| { |
| assert snapshotName != null; |
| for (ColumnFamilyStore cfStore : columnFamilyStores.values()) |
| cfStore.snapshot(snapshotName); |
| } |
| |
| /** |
| * @param clientSuppliedName may be null or empty. |
| * @return a snapshot name of the form {@code <currentTimeMillis>[-<clientSuppliedName>]}, |
| * e.g. "1312334400000-mybackup" |
| */ |
| public static String getTimestampedSnapshotName(String clientSuppliedName) |
| { |
| String snapshotName = Long.toString(System.currentTimeMillis()); |
| if (clientSuppliedName != null && !clientSuppliedName.equals("")) |
| { |
| snapshotName = snapshotName + "-" + clientSuppliedName; |
| } |
| return snapshotName; |
| } |
| |
| /** |
| * Checks whether a snapshot with the given name exists in any of the |
| * configured data directories. |
| * |
| * @param snapshotName the user-supplied snapshot name |
| * @return true if the snapshot exists |
| */ |
| public boolean snapshotExists(String snapshotName) |
| { |
| for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations()) |
| { |
| String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME + File.separator + snapshotName; |
| File snapshot = new File(snapshotPath); |
| if (snapshot.exists()) |
| { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Clears the snapshots with the given tag for this table; if the tag is |
| * empty, the entire snapshots directory is removed. |
| */ |
| public void clearSnapshot(String tag) throws IOException |
| { |
| for (String dataDirPath : DatabaseDescriptor.getAllDataFileLocations()) |
| { |
| // If tag is empty we will delete the entire snapshot directory |
| String snapshotPath = dataDirPath + File.separator + name + File.separator + SNAPSHOT_SUBDIR_NAME + File.separator + tag; |
| File snapshotDir = new File(snapshotPath); |
| if (snapshotDir.exists()) |
| { |
| if (logger.isDebugEnabled()) |
| logger.debug("Removing snapshot directory " + snapshotPath); |
| FileUtils.deleteRecursive(snapshotDir); |
| } |
| } |
| } |
| |
| /** |
| * @return A list of open SSTableReaders |
| */ |
| public List<SSTableReader> getAllSSTables() |
| { |
| List<SSTableReader> list = new ArrayList<SSTableReader>(); |
| for (ColumnFamilyStore cfStore : columnFamilyStores.values()) |
| list.addAll(cfStore.getSSTables()); |
| return list; |
| } |
| |
| private Table(String table) |
| { |
| name = table; |
| KSMetaData ksm = Schema.instance.getKSMetaData(table); |
| assert ksm != null : "Unknown keyspace " + table; |
| try |
| { |
| createReplicationStrategy(ksm); |
| } |
| catch (ConfigurationException e) |
| { |
| throw new RuntimeException(e); |
| } |
| |
| indexLocks = new Object[DatabaseDescriptor.getConcurrentWriters() * 128]; |
| for (int i = 0; i < indexLocks.length; i++) |
| indexLocks[i] = new Object(); |
| // create data directories. |
| for (String dataDir : DatabaseDescriptor.getAllDataFileLocations()) |
| { |
| try |
| { |
| String keyspaceDir = dataDir + File.separator + table; |
| if (!StorageService.instance.isClientMode()) |
| FileUtils.createDirectory(keyspaceDir); |
| |
| // remove the deprecated streaming directory. |
| File streamingDir = new File(keyspaceDir, "stream"); |
| if (streamingDir.exists()) |
| FileUtils.deleteRecursive(streamingDir); |
| } |
| catch (IOException ex) |
| { |
| throw new IOError(ex); |
| } |
| } |
| |
| for (CFMetaData cfm : new ArrayList<CFMetaData>(Schema.instance.getTableDefinition(table).cfMetaData().values())) |
| { |
| logger.debug("Initializing {}.{}", name, cfm.cfName); |
| initCf(cfm.cfId, cfm.cfName); |
| } |
| |
| } |
| |
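| /** |
| * (Re)creates the replication strategy from the keyspace metadata, first |
| * unregistering any previously registered strategy from the token metadata. |
| */ |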
| public void createReplicationStrategy(KSMetaData ksm) throws ConfigurationException |
| { |
| if (replicationStrategy != null) |
| StorageService.instance.getTokenMetadata().unregister(replicationStrategy); |
| |
| replicationStrategy = AbstractReplicationStrategy.createReplicationStrategy(ksm.name, |
| ksm.strategyClass, |
| StorageService.instance.getTokenMetadata(), |
| DatabaseDescriptor.getEndpointSnitch(), |
| ksm.strategyOptions); |
| } |
| |
| // best invoked on the compaction manager. |
| public void dropCf(Integer cfId) throws IOException |
| { |
| assert columnFamilyStores.containsKey(cfId); |
| ColumnFamilyStore cfs = columnFamilyStores.remove(cfId); |
| if (cfs == null) |
| return; |
| |
| unloadCf(cfs); |
| } |
| |
| // disassociate a cfs from this table instance. |
| private void unloadCf(ColumnFamilyStore cfs) throws IOException |
| { |
| try |
| { |
| cfs.forceBlockingFlush(); |
| } |
| catch (ExecutionException e) |
| { |
| throw new IOException(e); |
| } |
| catch (InterruptedException e) |
| { |
| throw new IOException(e); |
| } |
| cfs.invalidate(); |
| } |
| |
| /** Adds a cf to internal structures (ends up creating disk files). */ |
| public void initCf(Integer cfId, String cfName) |
| { |
| assert !columnFamilyStores.containsKey(cfId) : String.format("tried to init %s as %s, but already used by %s", |
| cfName, cfId, columnFamilyStores.get(cfId)); |
| columnFamilyStores.put(cfId, ColumnFamilyStore.createColumnFamilyStore(this, cfName)); |
| } |
| |
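| /** |
| * Fetches a row (or a slice of one) according to the given filter. A sketch |
| * of a point read; treat the construction details as assumptions of this |
| * era's API: |
| * <pre>{@code |
| * DecoratedKey<?> dk = StorageService.getPartitioner().decorateKey(keyBytes); |
| * QueryFilter filter = QueryFilter.getIdentityFilter(dk, new QueryPath("Standard1")); |
| * Row row = Table.open("Keyspace1").getRow(filter); |
| * }</pre> |
| */ |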
| public Row getRow(QueryFilter filter) throws IOException |
| { |
| ColumnFamilyStore cfStore = getColumnFamilyStore(filter.getColumnFamilyName()); |
| ColumnFamily columnFamily = cfStore.getColumnFamily(filter, ArrayBackedSortedColumns.factory()); |
| return new Row(filter.key, columnFamily); |
| } |
| |
| /** |
| * Applies the given mutation: optionally appends it to the commit log, then |
| * writes the data for each included column family into the corresponding |
| * column family store's memtable, maintaining secondary indexes as needed. |
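| * |
| * <p>Illustrative usage (a sketch only; the exact RowMutation construction |
| * API varies across versions, so treat these calls as assumptions): |
| * <pre>{@code |
| * RowMutation rm = new RowMutation("Keyspace1", key); |
| * rm.add(new QueryPath("Standard1", null, columnName), value, timestamp); |
| * Table.open("Keyspace1").apply(rm, true); // true: record in the commit log first |
| * }</pre> |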
| */ |
| public void apply(RowMutation mutation, boolean writeCommitLog) throws IOException |
| { |
| if (logger.isDebugEnabled()) |
| logger.debug("applying mutation of row {}", ByteBufferUtil.bytesToHex(mutation.key())); |
| |
| // write the mutation to the commitlog and memtables |
| switchLock.readLock().lock(); |
| try |
| { |
| if (writeCommitLog) |
| CommitLog.instance.add(mutation); |
| |
| DecoratedKey<?> key = StorageService.getPartitioner().decorateKey(mutation.key()); |
| for (ColumnFamily cf : mutation.getColumnFamilies()) |
| { |
| ColumnFamilyStore cfs = columnFamilyStores.get(cf.id()); |
| if (cfs == null) |
| { |
| logger.error("Attempting to mutate non-existent column family " + cf.id()); |
| continue; |
| } |
| |
| SortedSet<ByteBuffer> mutatedIndexedColumns = null; |
| for (ByteBuffer column : cfs.indexManager.getIndexedColumns()) |
| { |
| if (cf.getColumnNames().contains(column) || cf.isMarkedForDelete()) |
| { |
| if (mutatedIndexedColumns == null) |
| mutatedIndexedColumns = new TreeSet<ByteBuffer>(); |
| mutatedIndexedColumns.add(column); |
| if (logger.isDebugEnabled()) |
| { |
| // can't actually use validator to print value here, because we overload value |
| // for deletion timestamp as well (which may not be a well-formed value for the column type) |
| ByteBuffer value = cf.getColumn(column) == null ? null : cf.getColumn(column).value(); // may be null on row-level deletion |
| logger.debug(String.format("mutating indexed column %s value %s", |
| cf.getComparator().getString(column), |
| value == null ? "null" : ByteBufferUtil.bytesToHex(value))); |
| } |
| } |
| } |
| |
| // Sharding the lock is insufficient to avoid contention when there is a "hot" row, e.g., for |
| // hint writes when a node is down (keyed by target IP). So it is worth special-casing the |
| // no-index case to avoid the synchronization. |
| if (mutatedIndexedColumns == null) |
| { |
| cfs.apply(key, cf); |
| continue; |
| } |
| // else mutatedIndexedColumns != null |
| synchronized (indexLockFor(mutation.key())) |
| { |
| // with the raw data CF, we can just apply every update in any order and let |
| // read-time resolution throw out obsolete versions, thus avoiding read-before-write. |
| // but for indexed data we need to make sure that we're not creating index entries |
| // for obsolete writes. |
| ColumnFamily oldIndexedColumns = readCurrentIndexedColumns(key, cfs, mutatedIndexedColumns); |
| logger.debug("Pre-mutation index row is {}", oldIndexedColumns); |
| ignoreObsoleteMutations(cf, mutatedIndexedColumns, oldIndexedColumns); |
| |
| cfs.apply(key, cf); |
| |
| // ignore full index memtables -- we flush those when the "master" one is full |
| cfs.indexManager.applyIndexUpdates(mutation.key(), cf, mutatedIndexedColumns, oldIndexedColumns); |
| } |
| } |
| } |
| finally |
| { |
| switchLock.readLock().unlock(); |
| } |
| |
| } |
| |
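| /** |
| * Removes from mutatedIndexedColumns (and from oldIndexedColumns) any columns |
| * whose update is obsoleted by existing data, so that no index entries are |
| * created for writes that read-time resolution would discard anyway. |
| */ |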
| private static void ignoreObsoleteMutations(ColumnFamily cf, SortedSet<ByteBuffer> mutatedIndexedColumns, ColumnFamily oldIndexedColumns) |
| { |
| // DO NOT modify the cf object here, it can race w/ the CL write (see https://issues.apache.org/jira/browse/CASSANDRA-2604) |
| |
| if (oldIndexedColumns == null) |
| return; |
| |
| for (Iterator<ByteBuffer> iter = mutatedIndexedColumns.iterator(); iter.hasNext(); ) |
| { |
| ByteBuffer name = iter.next(); |
| IColumn newColumn = cf.getColumn(name); // null means a row-level delete; otherwise it wouldn't have been marked mutated |
| if (newColumn != null && cf.isMarkedForDelete()) |
| { |
| // row is marked for delete, but column was also updated. if column is timestamped less than |
| // the row tombstone, treat it as if it didn't exist. Otherwise we don't care about row |
| // tombstone for the purpose of the index update and we can proceed as usual. |
| if (newColumn.timestamp() <= cf.getMarkedForDeleteAt()) |
| { |
| // don't remove from the cf object; that can race w/ CommitLog write. Leaving it is harmless. |
| newColumn = null; |
| } |
| } |
| IColumn oldColumn = oldIndexedColumns.getColumn(name); |
| |
| // deletions are irrelevant to the index unless we're changing state from live -> deleted, i.e., |
| // just updating w/ a newer tombstone doesn't matter |
| boolean bothDeleted = (newColumn == null || newColumn.isMarkedForDelete()) |
| && (oldColumn == null || oldColumn.isMarkedForDelete()); |
| // obsolete means either the row or the column timestamp we're applying is older than existing data |
| boolean obsoleteRowTombstone = newColumn == null && oldColumn != null && cf.getMarkedForDeleteAt() < oldColumn.timestamp(); |
| boolean obsoleteColumn = newColumn != null && (newColumn.timestamp() <= oldIndexedColumns.getMarkedForDeleteAt() |
| || (oldColumn != null && oldColumn.reconcile(newColumn) == oldColumn)); |
| if (bothDeleted || obsoleteRowTombstone || obsoleteColumn) |
| { |
| if (logger.isDebugEnabled()) |
| logger.debug("skipping index update for obsolete mutation of " + cf.getComparator().getString(name)); |
| iter.remove(); |
| oldIndexedColumns.remove(name); |
| } |
| } |
| } |
| |
| private static ColumnFamily readCurrentIndexedColumns(DecoratedKey<?> key, ColumnFamilyStore cfs, SortedSet<ByteBuffer> mutatedIndexedColumns) |
| { |
| QueryFilter filter = QueryFilter.getNamesFilter(key, new QueryPath(cfs.getColumnFamilyName()), mutatedIndexedColumns); |
| return cfs.getColumnFamily(filter); |
| } |
| |
| public AbstractReplicationStrategy getReplicationStrategy() |
| { |
| return replicationStrategy; |
| } |
| |
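| /** |
| * Creates index entries for an existing row, e.g. when building a new index |
| * over previously written data. Takes the same locks as apply() so that it |
| * serializes correctly with concurrent mutations. |
| */ |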
| public static void indexRow(DecoratedKey<?> key, ColumnFamilyStore cfs, SortedSet<ByteBuffer> indexedColumns) |
| { |
| if (logger.isDebugEnabled()) |
| logger.debug("Indexing row {} ", cfs.metadata.getKeyValidator().getString(key.key)); |
| |
| switchLock.readLock().lock(); |
| try |
| { |
| synchronized (cfs.table.indexLockFor(key.key)) |
| { |
| ColumnFamily cf = readCurrentIndexedColumns(key, cfs, indexedColumns); |
| if (cf != null) |
| try |
| { |
| cfs.indexManager.applyIndexUpdates(key.key, cf, cf.getColumnNames(), null); |
| } |
| catch (IOException e) |
| { |
| throw new IOError(e); |
| } |
| } |
| } |
| finally |
| { |
| switchLock.readLock().unlock(); |
| } |
| } |
| |
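| /** |
| * Returns one lock from a fixed pool, striped by the mutation key's hash; |
| * this bounds memory use while keeping contention low across distinct keys. |
| */ |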
| private Object indexLockFor(ByteBuffer key) |
| { |
| return indexLocks[Math.abs(key.hashCode() % indexLocks.length)]; |
| } |
| |
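| /** |
| * Triggers a flush on every column family store in this keyspace and returns |
| * the resulting futures; stores whose memtables have nothing to flush return |
| * no future and are skipped. |
| */ |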
| public List<Future<?>> flush() throws IOException |
| { |
| List<Future<?>> futures = new ArrayList<Future<?>>(); |
| // iterate values() directly: get(cfId) could return null (and NPE) if a CF is dropped concurrently |
| for (ColumnFamilyStore cfs : columnFamilyStores.values()) |
| { |
| Future<?> future = cfs.forceFlush(); |
| if (future != null) |
| futures.add(future); |
| } |
| return futures; |
| } |
| |
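| /** |
| * @param expectedSize the number of bytes the caller expects to write |
| * @return a data directory with enough free space, or null if none is |
| * available even after attempting to reclaim space from compacted SSTables |
| */ |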
| public String getDataFileLocation(long expectedSize) |
| { |
| String path = DatabaseDescriptor.getDataFileLocationForTable(name, expectedSize); |
| // Requesting GC has a chance to free space only if we're using mmap and a non-Sun JVM |
| if (path == null |
| && (DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap || DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap) |
| && !MmappedSegmentedFile.isCleanerAvailable()) |
| { |
| StorageService.instance.requestGC(); |
| // retry after GCing has forced unmap of compacted SSTables so they can be deleted |
| // Note: GCInspector will do this already, but only the Sun JVM supports GCInspector so far |
| SSTableDeletingTask.rescheduleFailedTasks(); |
| try |
| { |
| Thread.sleep(10000); |
| } |
| catch (InterruptedException e) |
| { |
| throw new AssertionError(e); |
| } |
| path = DatabaseDescriptor.getDataFileLocationForTable(name, expectedSize); |
| } |
| return path; |
| } |
| |
| public static String getSnapshotPath(String dataDirPath, String tableName, String snapshotName) |
| { |
| return getSnapshotPath(dataDirPath + File.separator + tableName, snapshotName); |
| } |
| |
| public static String getSnapshotPath(String tableDirectory, String snapshotName) |
| { |
| return tableDirectory + File.separator + SNAPSHOT_SUBDIR_NAME + File.separator + snapshotName; |
| } |
| |
| public static Iterable<Table> all() |
| { |
| Function<String, Table> transformer = new Function<String, Table>() |
| { |
| public Table apply(String tableName) |
| { |
| return Table.open(tableName); |
| } |
| }; |
| return Iterables.transform(Schema.instance.getTables(), transformer); |
| } |
| |
| @Override |
| public String toString() |
| { |
| return getClass().getSimpleName() + "(name='" + name + "')"; |
| } |
| } |