scripts/utils/sample.dml - systemds - Git at Google

 #-------------------------------------------------------------
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 #
 #-------------------------------------------------------------

 # Randomly sample data (without replacement) into disjoint subsets.
 # The sizes of the subsets are specified in terms of fractions, stored
 # as a 1-column vector in a separate input file (see parameter 'sv').
 #
 # Parameters:
 #    X    : (input) input data set: filename of input data set
 #    sv   : (input) splitting vector: filename of 1-column vector with
 #           fractions. sum(sv) must be less than or equal to 1
 #               e.g. sv = [0.2]: Draw a 20% simple random sample
 #                    without replacement.
 #               e.g. sv = [0.25,0.25,0.25,0.25]: Randomly split data
 #                    into 4 approximately equal-sized disjoint subsets.
 #               e.g. sv = [0.5,0.3,0.2]: Randomly split data into 3
 #                    disjoint subsets that contain roughly 50%, 30%
 #                    and 20% of original data, respectively.
 #    O    : (output) output folder name. The output subsets are stored
 #           in subfolders named by consecutive integers: $O/1, $O/2,
 #           ..., $O/#subsets
 #    ofmt : (output, default "binary") format of output file. Other
 #           valid options are: "csv" and "text"
 #
 # Example:
 #   printf "0.8\n0.2" | hadoop fs -put - /tmp/sv.csv
 #   echo '{"data_type": "matrix", "value_type": "double", "rows": 2, "cols": 1, "format": "csv"}' | hadoop fs -put - /tmp/sv.csv.mtd
 #   hadoop jar SystemDS.jar -f ./scripts/utils/sample.dml -nvargs X=/tmp/X.mtx sv=/tmp/sv.csv O=/tmp/Out ofmt=csv

 # set defaults
 ofmt = ifdef($ofmt, "binary");   # output format; falls back to "binary" when ofmt is not passed

 # Read inputs
 X = read ($X);         # X: dataset; its rows are the units being sampled
 sv = read ($sv);       # sv: splitting fraction vector (one fraction per subset)

 # Draw one uniform random number per row of X. Row r is assigned to subset i
 # iff svLowBnd[i] < R[r] <= svUpBnd[i]. The bounds come from the prefix sum of
 # sv, so the intervals are adjacent and non-overlapping: every row lands in at
 # most one subset (sampling without replacement).
 R = rand(rows=nrow(X), cols=1, min=0.0, max=1.0, pdf = "uniform");
 svUpBnd = cumsum(sv);          # upper bound of interval i: sv[1] + ... + sv[i]
 svLowBnd = svUpBnd - sv;       # lower bound of interval i: sv[1] + ... + sv[i-1]

 # Build the 0/1 selection vector SM for each subset and extract its rows
 parfor ( i in 1:nrow(sv))
 {
   T1 = R <= as.scalar(svUpBnd[i,1]);   # 1 where the draw is at or below the upper bound
   T2 = R > as.scalar(svLowBnd[i,1]);   # 1 where the draw is strictly above the lower bound
   SM = T1 * T2;                        # element-wise AND: 1 for rows belonging to subset i
   # Select the marked rows directly via the 'select' vector instead of
   # materializing an nrow(X) x nrow(X) diag(SM) matrix and multiplying by X.
   iX = removeEmpty(target=X, margin="rows", select=SM);
   write (iX, $O + "/" + i, format=ofmt);   # subset i is stored under $O/i
 }
	#-------------------------------------------------------------
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	#
	#-------------------------------------------------------------

	# Randomly sample data (without replacement) into disjoint subsets.
	# The sizes of the subsets are specified in terms of fractions, stored
	# as a 1-column vector in a separate input file (see parameter 'sv').
	#
	# Parameters:
	# X : (input) input data set: filename of input data set
	# sv : (input) splitting vector: filename of 1-column vector with
	# fractions. sum(sv) must be less than or equal to 1
	# e.g. sv = [0.2]: Draw a 20% simple random sample
	# without replacement.
	# e.g. sv = [0.25,0.25,0.25,0.25]: Randomly split data
	# into 4 approximately equal-sized disjoint subsets.
	# e.g. sv = [0.5,0.3,0.2]: Randomly split data into 3
	# disjoint subsets that contain roughly 50%, 30%
	# and 20% of original data, respectively.
	# O : (output) output folder name. The output subsets are stored
	# in subfolders named by consecutive integers: $O/1, $O/2,
	# ..., $O/#subsets
	# ofmt : (output, default "binary") format of output file. Other
	# valid options are: "csv" and "text"
	#
	# Example:
	# printf "0.8\n0.2" | hadoop fs -put - /tmp/sv.csv
	# echo '{"data_type": "matrix", "value_type": "double", "rows": 2, "cols": 1, "format": "csv"}' | hadoop fs -put - /tmp/sv.csv.mtd
	# hadoop jar SystemDS.jar -f ./scripts/utils/sample.dml -nvargs X=/tmp/X.mtx sv=/tmp/sv.csv O=/tmp/Out ofmt=csv

	# set defaults
	ofmt = ifdef($ofmt, "binary"); # output format; falls back to "binary" when ofmt is not passed

	# Read inputs
	X = read ($X); # X: dataset; its rows are the units being sampled
	sv = read ($sv); # sv: splitting fraction vector (one fraction per subset)

	# Draw one uniform random number per row of X. Row r is assigned to subset i
	# iff svLowBnd[i] < R[r] <= svUpBnd[i]. The bounds come from the prefix sum of
	# sv, so the intervals are adjacent and non-overlapping: every row lands in at
	# most one subset (sampling without replacement).
	R = rand(rows=nrow(X), cols=1, min=0.0, max=1.0, pdf = "uniform");
	svUpBnd = cumsum(sv); # upper bound of interval i: sv[1] + ... + sv[i]
	svLowBnd = svUpBnd - sv; # lower bound of interval i: sv[1] + ... + sv[i-1]

	# Build the 0/1 selection vector SM for each subset and extract its rows
	parfor ( i in 1:nrow(sv))
	{
	  T1 = R <= as.scalar(svUpBnd[i,1]); # 1 where the draw is at or below the upper bound
	  T2 = R > as.scalar(svLowBnd[i,1]); # 1 where the draw is strictly above the lower bound
	  SM = T1 * T2; # element-wise AND: 1 for rows belonging to subset i
	  # Select the marked rows directly via the 'select' vector instead of
	  # materializing an nrow(X) x nrow(X) diag(SM) matrix and multiplying by X.
	  iX = removeEmpty(target=X, margin="rows", select=SM);
	  write (iX, $O + "/" + i, format=ofmt); # subset i is stored under $O/i
	}