src/ports/postgres/modules/svm/kernel_approximation.py_in - madlib - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 from __future__ import division

 import plpy

 from utilities.utilities import unique_string, num_features, _cast_if_null

 import collections
 import functools
 from math import sqrt, pi, log, factorial
 import operator
 from random import random, seed


 # Names describing a polynomial random mapping as used by PolyKernel:
 # `weights`, `coefs`, `reps` and `other_features` are the names of the
 # relations holding the mapping, while `rd_id` and `rd_val` are the names of
 # the id and value columns used inside those relations.
 PolyRandOperator = collections.namedtuple(
     'PolyRandOperator', 'weights, coefs, reps, other_features, rd_id, rd_val')


 class LinearKernel(object):
     """Pass-through kernel.

     The features are used as they are; the only optional change made by
     transform() is appending a constant intercept term to each feature array.
     """

     def __init__(self, schema_madlib,
                  create_view=True, fit_intercept=True, **kwargs):
         # extra keyword arguments are accepted and ignored
         self.kernel_func = 'linear'
         self.schema_madlib = schema_madlib
         self.create_view = create_view
         self.fit_intercept = fit_intercept
         # both are filled in by transform()
         self.original_table = None
         self.transformed_table = None

     def clear(self):
         """Drop the relation created by the last transform(), if there is one."""
         if not self.transformed_table:
             return
         if self.create_view:
             relation_kind = 'view'
         else:
             relation_kind = 'table'
         drop_sql = "DROP {data_type} IF EXISTS {rel} CASCADE".format(
             data_type=relation_kind,
             rel=self.transformed_table['source_table'])
         plpy.execute(drop_sql)

     def save_as(self, _):
         """No-op: a linear kernel has no state worth persisting."""
         return None

     @classmethod
     def _get_default_params(cls):
         """Parameters used by create() when the caller supplies none."""
         defaults = dict()
         defaults['fit_intercept'] = False
         return defaults

     @classmethod
     def create(cls, schema_madlib, params=None):
         """Build a kernel from `params`, or from the defaults if it is empty."""
         chosen = params if params else cls._get_default_params()
         return cls(schema_madlib, **chosen)

     @property
     def kernel_params(self):
         """Parameter string of the form 'fit_intercept=<value>'."""
         template = 'fit_intercept={fit_intercept}'
         return template.format(fit_intercept=self.fit_intercept)

     def fit(self, _):
         """Nothing to learn; only removes a previously transformed relation."""
         self.clear()
         return self

     def transform(self, source_table, independent_varname,
                   dependent_varname=None, grouping_col=None, id_col=None,
                   transformed_name='linear_transformed'):
         """Record the source relation and, when an intercept is requested,
         materialise a copy of it with a constant 1 appended to the features.

         Rows whose feature array contains NULL are left out of the copy.
         `transformed_name` is not used by this kernel.

         Returns:
             self; `transformed_table` stays None when no intercept is fitted.
         """
         self.original_table = {'source_table': source_table,
                                'independent_varname': independent_varname,
                                'dependent_varname': dependent_varname}
         self.transformed_table = None
         if not self.fit_intercept:
             return self

         relation_kind = 'VIEW' if self.create_view else 'TABLE'
         # missing columns are replaced by NULL placeholders
         id_expr = _cast_if_null(id_col, unique_string('id_col'))
         group_expr = _cast_if_null(grouping_col, unique_string('grp_col'))
         target_expr = _cast_if_null(dependent_varname)
         features_alias = unique_string(desp='features_col')
         target_alias = unique_string(desp='target_col')
         relation = unique_string(desp='source_copied')
         run_sql = """
                 DROP {data_type} IF EXISTS {transformed_rel};
                 CREATE {data_type} {transformed_rel} AS
                     SELECT
                         array_cat({independent_varname}, {intercept_str})::float[] as {features_col},
                         {dependent_varname} as {target_col},
                         {id_col},
                         {grouping_col}
                     FROM {source_table}
                     WHERE NOT {schema_madlib}.array_contains_null({independent_varname})
                 """.format(data_type=relation_kind,
                            transformed_rel=relation,
                            independent_varname=independent_varname,
                            # this point is only reached with an intercept
                            intercept_str="ARRAY[1]::float[]",
                            features_col=features_alias,
                            dependent_varname=target_expr,
                            target_col=target_alias,
                            id_col=id_expr,
                            grouping_col=group_expr,
                            source_table=source_table,
                            schema_madlib=self.schema_madlib)
         plpy.execute(run_sql)
         self.transformed_table = {'source_table': relation,
                                   'dependent_varname': target_alias,
                                   'independent_varname': features_alias}
         return self


 class PolyKernel(object):
     """Random-feature approximation of the polynomial kernel.

     fit() draws a random mapping and stores it in temp tables whose names are
     kept in a PolyRandOperator, transform() applies the mapping to a source
     table, and save_as()/load_from() persist and restore the mapping.
     """

     def __init__(self, schema_madlib, degree=2, coef0=1, n_components=100,
                  random_state=1, poly_operator=None, orig_data=None,
                  fit_intercept=True, **kwargs):
         """
         Args:
             schema_madlib: schema name used to qualify MADlib functions.
             degree: degree of the polynomial kernel.
             coef0: independent term of the polynomial kernel.
             n_components: number of random features; replaced by the size
                 of the stored mapping when `poly_operator` is given.
             random_state: seed for the random number generators.
             poly_operator: PolyRandOperator naming an existing mapping, or
                 None if the mapping still has to be fitted.
             orig_data: table the mapping was loaded from, or None.
             fit_intercept: whether transform() appends a constant 1.
             **kwargs: accepted and ignored.
         """
         self.schema_madlib = schema_madlib
         self.kernel_func = 'polynomial'
         self.degree = degree
         self.coef0 = coef0
         self.n_components = n_components
         self.random_state = random_state
         self.fit_intercept = fit_intercept
         # polynomial random mapping operator
         self.pro = poly_operator
         self.orig_data = orig_data
         # filled in by transform(); initialised here so that they can be
         # read even when transform() returns early or was never called
         self.transformed_table = None
         self.original_table = None
         if self.pro is not None:
             pro = self.pro
             self.n_components = num_features(pro.coefs, pro.rd_val)
             self.n_components += num_features(pro.other_features, pro.rd_val)

     def clear(self):
         """Drop the relations backing the current random mapping, if any."""
         # load_from() creates views over the saved table, fit() creates tables
         data_type = 'view' if self.orig_data else 'table'
         if self.pro:
             run_sql = """
                 drop {data_type} if exists {pro.weights};
                 drop {data_type} if exists {pro.coefs};
                 drop {data_type} if exists {pro.reps};
                 drop {data_type} if exists {pro.other_features};
             """.format(pro=self.pro, data_type=data_type)
             plpy.execute(run_sql)

     def save_as(self, name):
         """Store the random mapping in table `name` (columns id, val, desp).

         Only warns and returns when the mapping was loaded from an existing
         table, because that table already holds the data.
         """
         if self.orig_data:
             # the two literals are concatenated, hence the explicit space
             plpy.warning("Polynomial Kernel Warning: no need to save. "
                          "Original data table exists: {0}"
                          .format(self.orig_data))
             return
         run_sql = """
             create table {name} as
                 select {pro.rd_id} as id, {pro.rd_val} as val,
                        'coefs' as desp
                 from {pro.coefs}
                 union
                 select {pro.rd_id} as id, {pro.rd_val} as val,
                        'weights' as desp
                 from {pro.weights}
                 union
                 select {pro.rd_id} as id, {pro.rd_val} as val,
                        'reps' as desp
                 from {pro.reps}
                 union
                 select {pro.rd_id} as id, {pro.rd_val} as val,
                        'other_features' as desp
                 from {pro.other_features}
         """.format(name=name, pro=self.pro)
         plpy.execute(run_sql)

     @classmethod
     def create(cls, schema_madlib, n_features, params=None):
         """Build an unfitted kernel from `params`, or from the defaults
         derived from `n_features` when `params` is empty.
         """
         if not params:
             params = cls._get_default_params(n_features)
         return cls(schema_madlib, **params)

     @classmethod
     def load_from(cls, schema_madlib, data, params=None):
         """Rebuild a kernel from table `data` as written by save_as().

         One temp view per component is created over `data`, selected by the
         value of its `desp` column.
         """
         other_features = unique_string(desp='other_features')
         rd_weights = unique_string(desp='random_weights')
         rd_coefs = unique_string(desp='random_coefs')
         rd_reps = unique_string(desp='random_reps')
         rd_val = unique_string(desp='val')
         rd_id = unique_string(desp='id')
         if not params:
             params = cls._get_default_params()
         plpy.execute("""
                 drop view if exists {rd_weights};
                 create temp view {rd_weights} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'weights';

                 drop view if exists {rd_coefs};
                 create temp view {rd_coefs} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'coefs';

                 drop view if exists {rd_reps};
                 create temp view {rd_reps} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'reps';

                 drop view if exists {other_features};
                 create temp view {other_features} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'other_features';
                      """.format(**locals()))
         pro = PolyRandOperator(weights=rd_weights, coefs=rd_coefs,
                                reps=rd_reps, other_features=other_features,
                                rd_id=rd_id, rd_val=rd_val)

         return cls(schema_madlib, poly_operator=pro, orig_data=data, **params)

     @classmethod
     def _get_default_params(cls, n_features=10):
         """Default constructor parameters for `n_features` input features."""
         return {
             'n_components': 2 * n_features,
             'fit_intercept': False,
             'random_state': 1,
             'degree': 3,
             'coef0': 1,
         }

     @property
     def kernel_params(self):
         """Comma-separated 'name=value' string of the kernel parameters."""
         return ('degree={self.degree}, coef0={self.coef0}, '
                 'n_components={self.n_components}, '
                 'random_state={self.random_state}, '
                 'fit_intercept={self.fit_intercept}'
                 .format(self=self))

     @staticmethod
     def _ncr(n, r):
         """Return the binomial coefficient C(n, r) for 0 <= r <= n."""
         r = min(r, n - r)
         if r == 0:
             return 1
         # n * (n-1) * ... * (n-r+1) has exactly r factors, so the matching
         # denominator is r! (not (r+1)!)
         numer = functools.reduce(operator.mul, range(n, n - r, -1))
         return numer // factorial(r)

     @classmethod
     def _maclaurin_coefs(cls, r, q, k):
         """Coefficient of x**k in the Maclaurin expansion of (q + x)**r."""
         # NOTE(review): for q == 0 this yields 0 for every k, including
         # k == r where (0 + x)**r = x**r has coefficient 1 -- confirm that
         # zeroing the whole mapping for coef0 = 0 is intended.
         if q == 0:
             return 0.
         return cls._ncr(r, k) * (q ** (r - k))

     @staticmethod
     def _store_array(desp, values, rd_id, rd_val):
         """Create a one-row temp table holding `values` as float[] and
         return the generated table name.
         """
         table = unique_string(desp=desp)
         run_sql = """
             drop table if exists {data};
             create temp table {data} as
                 select
                     $1 as {val}, id as {id}
                 from generate_series(1, 1) as id
         """.format(data=table, val=rd_val, id=rd_id)
         plpy.execute(plpy.prepare(run_sql, ["float[]"]), [values])
         return table

     def fit(self, n_features):
         """Draw a new random mapping for inputs with `n_features` features.

         Any previous mapping is dropped first. The new mapping is stored in
         temp tables and referenced through `self.pro`.

         Returns:
             self
         """
         self.clear()
         self.orig_data = None
         # one scaled coefficient per possible order 0..degree
         coefs_ = [sqrt(self._maclaurin_coefs(self.degree, self.coef0, k) *
                        (2 ** (k + 1)))
                   for k in range(self.degree + 1)]
         seed(self.random_state)
         # random order of each component: int(log2(1/u)), u uniform
         reps_ = [int(log((1. / random()), 2)) for _ in range(self.n_components)]
         # orders that need random projections: 1..degree
         reps_nz_ = [x for x in reps_ if (x > 0) and (x <= self.degree)]
         rd_val_ = unique_string(desp='val')
         rd_id_ = unique_string(desp='id')
         rd_weights_ = unique_string(desp='random_weights')
         run_sql = """
             drop table if exists {rd_weights};
             select {schema_madlib}.matrix_random(
                         1, {dim},
                         'upper=1, lower=-1, seed={seed}, temp_out=true',
                         'bernoulli', '{rd_weights}',
                         'row={id}, val={val}')
         """.format(rd_weights=rd_weights_,
                    dim=sum(reps_nz_) * n_features,
                    seed=self.random_state,
                    schema_madlib=self.schema_madlib,
                    val=rd_val_, id=rd_id_)
         plpy.execute(run_sql)

         rd_coefs_ = self._store_array('rd_coefs',
                                       [coefs_[k] for k in reps_nz_],
                                       rd_id_, rd_val_)
         rd_reps_ = self._store_array('reps_nz', reps_nz_, rd_id_, rd_val_)
         # components of order 0 are the constant coefs_[0]; components whose
         # order exceeds the degree contribute 0
         constant_vals_ = ([coefs_[0]] * len([x for x in reps_ if x == 0]) +
                           [0] * len([x for x in reps_ if x > self.degree]))
         other_features_ = self._store_array('other_features', constant_vals_,
                                             rd_id_, rd_val_)

         self.pro = PolyRandOperator(weights=rd_weights_,
                                     coefs=rd_coefs_, reps=rd_reps_,
                                     other_features=other_features_,
                                     rd_id=rd_id_, rd_val=rd_val_)
         return self

     def transform(self, source_table, independent_varname,
                   dependent_varname=None, grouping_col=None, id_col=None,
                   transformed_name='poly_transformed'):
         """Apply the fitted mapping to `source_table`.

         The result is written to a temp table whose name and column names
         are stored in `self.transformed_table`. Rows whose feature array
         contains NULL are skipped. Does nothing when no mapping is available.

         Returns:
             self
         """
         if not self.pro:
             return self
         self.original_table = dict(source_table=source_table,
                                    independent_varname=independent_varname,
                                    dependent_varname=dependent_varname)
         schema_madlib = self.schema_madlib
         grouping_col = _cast_if_null(grouping_col, unique_string('grp_col'))
         dependent_varname = _cast_if_null(dependent_varname, '')
         id_col = _cast_if_null(id_col, unique_string('id_col'))

         features_col = unique_string(desp='features_col')
         target_col = unique_string(desp='target_col')
         transformed = unique_string(desp=transformed_name)
         intercept = "NULL" if not self.fit_intercept else "ARRAY[1]::float[]"
         # features = multiplier * [coefs * row_fold(X * weights, reps),
         #                          other_features], plus optional intercept
         pro, multiplier = self.pro, sqrt(1. / self.n_components)
         run_sql = """
         drop table if exists {transformed};
         create temp table {transformed} as
             select
                 array_cat(
                     {schema_madlib}.array_scalar_mult(
                         array_cat(
                             {schema_madlib}.array_mult(
                                 {schema_madlib}.__row_fold(
                                     {schema_madlib}.__matrix_vec_mult_in_mem(
                                         q.{features_col}::float[],
                                         weights.{pro.rd_val}::float[]
                                     )::float[],
                                     reps.{pro.rd_val}::integer[]
                                 )::float[],
                                 coefs.{pro.rd_val}::float[]
                             )::float[],
                             of.{pro.rd_val}::float[]
                         )::float[],
                         {multiplier}::float
                     )::float[],
                     {intercept}
                 ) as {features_col},
                 q.{target_col} as {target_col},
                 {id_col},
                 {grouping_col}
             from (
                 select
                     {dependent_varname} as {target_col},
                     {independent_varname} as {features_col},
                     {id_col},
                     {grouping_col}
                 from {source_table}
                 WHERE not {schema_madlib}.array_contains_null({independent_varname})
             ) q cross join (select {pro.rd_val} from {pro.weights}) as weights
                 cross join (select {pro.rd_val} from {pro.coefs}) as coefs
                 cross join (select {pro.rd_val} from {pro.reps}) as reps
                 cross join (select {pro.rd_val} from {pro.other_features}) as of
         """.format(**locals())
         plpy.execute(run_sql)
         # assert(self.n_components == num_features(transformed, features_col))
         self.transformed_table = dict(source_table=transformed,
                                       dependent_varname=target_col,
                                       independent_varname=features_col)
         return self


 class GaussianKernelBase(object):

     """Shared state and helpers of the Gaussian kernel approximations.

     Holds the names of the relations with the random weights and random
     offsets, creates and drops them, and persists/restores them. The actual
     fit() and transform() are provided by the subclasses chosen in create()
     and load_from().
     """

     def __init__(self, schema_madlib, gamma, n_components, random_state,
                  random_weights, random_offset, id_col, val_col,
                  orig_data, fit_intercept=True, **kwargs):
         """
         Args:
             schema_madlib: schema name used to qualify MADlib functions.
             gamma: kernel parameter; the weights use sigma = sqrt(2 * gamma).
             n_components: number of random features; replaced by the size
                 of the stored offsets when `random_offset` is given.
             random_state: seed passed to the random matrix generator.
             random_weights: name of the relation with the weights, or None.
             random_offset: name of the relation with the offsets, or None.
             id_col: id column name inside the random relations.
             val_col: value column name inside the random relations.
             orig_data: table the random relations were loaded from, or None.
             fit_intercept: whether transform() appends a constant 1.
             **kwargs: accepted and ignored.
         """
         self.kernel_func = 'gaussian'
         self.gamma = gamma
         self.n_components = n_components
         # int32 seed used by boost::minstd_rand
         self.random_state = random_state
         self.fit_intercept = fit_intercept
         # random operators
         self.rd_weights = random_weights
         self.rd_offset = random_offset
         # val column in random operators
         self.rd_val = val_col
         # id column in random operators
         self.rd_id = id_col
         self.transformed_table = dict()
         self.original_table = dict()
         # indicate whether rd_weights and rd_offset is view or table
         # store the original data table name if they are view
         # None if they are table
         self.orig_data = orig_data
         self.schema_madlib = schema_madlib
         if self.rd_offset is not None:
             self.n_components = num_features(self.rd_offset, self.rd_val)

     def _random_weights(self, row_dim, col_dim, rd_id, rd_val):
         """Create a temp matrix of normal(0, sqrt(2 * gamma)) weights with
         the given dimensions and return its table name.
         """
         rd_weights = unique_string(desp='random_weights')
         sigma = sqrt(2 * self.gamma)
         seed = self.random_state
         plpy.execute("""
             drop table if exists {rd_weights};
             select {self.schema_madlib}.matrix_random(
                     {row_dim}, {col_dim},
                     'mu=0, sigma={sigma}, seed={seed}, temp_out=true',
                     'normal', '{rd_weights}',
                     'row={rd_id}, val={rd_val}');
         """.format(**locals()))
         return rd_weights

     def _random_offsets(self, row_dim, col_dim, rd_id, rd_val):
         """Create a temp matrix of uniform(0, 2 * pi) offsets with the given
         dimensions and return its table name.
         """
         rd_offset = unique_string(desp='random_offsets')
         max_ = 2 * pi
         seed = self.random_state
         plpy.execute("""
             drop table if exists {rd_offset};
             select {self.schema_madlib}.matrix_random(
                     {row_dim}, {col_dim},
                     'min=0, max={max_}, seed={seed}, temp_out=true',
                     'uniform', '{rd_offset}',
                     'row={rd_id}, val={rd_val}');
         """.format(**locals()))
         return rd_offset

     def clear(self):
         """Drop the weight and offset relations that are currently set."""
         # loaded operators are views over orig_data, fitted ones are tables
         data_type = 'view' if self.orig_data else 'table'
         if self.rd_weights:
             plpy.execute("drop {data_type} if exists {data};".format(
                          data=self.rd_weights,
                          data_type=data_type))
         if self.rd_offset:
             plpy.execute("drop {data_type} if exists {data};".format(
                          data=self.rd_offset,
                          data_type=data_type))

     def save_as(self, name):
         """Store offsets and weights in table `name` (columns id, val, desp).

         Only warns and returns when the operators were loaded from an
         existing table, because that table already holds the data.
         """
         if self.orig_data:
             # the two literals are concatenated, hence the explicit space
             plpy.warning("Gaussian Kernel Warning: no need to save. "
                          "Original data table exists: {0}".
                          format(self.orig_data))
             return

         run_sql = """
             create table {name} as
                 select
                         {self.rd_id} as id, {self.rd_val} as val,
                         'offsets' as desp
                 from {self.rd_offset}
                 union
                 select
                         {self.rd_id} as id, {self.rd_val} as val,
                         'weights' as desp
                 from {self.rd_weights}
         """.format(name=name, self=self)
         plpy.execute(run_sql)

     @classmethod
     def _split_params(cls, params, n_features=10):
         """Separate the 'fit_in_memory' switch from the constructor params.

         Works on a copy so that the caller's dict is never modified; popping
         from the original would make a second call with the same dict fall
         back to the default of the switch.

         Returns:
             (fit_in_memory, constructor_params)
         """
         if params:
             resolved = dict(params)
         else:
             resolved = cls._get_default_params(n_features)
         in_memory = resolved.pop('fit_in_memory', True)
         return in_memory, resolved

     @classmethod
     def create(cls, schema_madlib, n_features, params=None):
         """Build an unfitted Gaussian kernel.

         The in-memory variant is chosen when 'fit_in_memory' is set (the
         default) and the weight matrix is small enough; `params`, when
         given, must contain 'n_components'.
         """
         in_memory, params = cls._split_params(params, n_features)
         # according to the 1gb limit on each entry of the table
         n_elems = params['n_components'] * n_features
         if in_memory and n_elems <= 1e8:
             return GaussianKernelInMemory(schema_madlib, **params)
         else:
             return GaussianKernel(schema_madlib, **params)

     @classmethod
     def _get_default_params(cls, n_features=10):
         """Default parameters for `n_features` input features."""
         return {
             'n_components': 2 * n_features,
             'fit_intercept': False,
             'random_state': 1,
             'fit_in_memory': True,
             # float literal keeps this a true division in every context
             'gamma': 1. / n_features,
         }

     @classmethod
     def load_from(cls, schema_madlib, data, params=None):
         """Rebuild a kernel from table `data` as written by save_as().

         Temp views for the weights and the offsets are created over `data`,
         selected by the value of its `desp` column.
         """
         rd_weights = unique_string(desp='random_weights')
         rd_offset = unique_string(desp='random_offsets')
         rd_val = unique_string(desp='val')
         rd_id = unique_string(desp='id')
         plpy.execute("""
                 drop view if exists {rd_weights};
                 create temp view {rd_weights} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'weights';

                 drop view if exists {rd_offset};
                 create temp view {rd_offset} as
                     select id as {rd_id}, val as {rd_val} from {data}
                     where desp = 'offsets';
                      """.format(**locals()))
         in_memory, params = cls._split_params(params)
         if in_memory:
             return GaussianKernelInMemory(schema_madlib,
                                           random_weights=rd_weights,
                                           random_offset=rd_offset,
                                           id_col=rd_id, val_col=rd_val,
                                           orig_data=data, **params)
         else:
             return GaussianKernel(schema_madlib,
                                   random_weights=rd_weights,
                                   random_offset=rd_offset,
                                   id_col=rd_id, val_col=rd_val,
                                   orig_data=data, **params)


 class GaussianKernel(GaussianKernelBase):

     """Gaussian kernel approximation that keeps the random weights as a
     matrix table and projects the data with the matrix_mult function.
     """

     def __init__(self, schema_madlib, gamma=1, n_components=100,
                  random_state=1, random_weights=None,
                  random_offset=None, id_col=None, val_col=None,
                  orig_data=None, fit_intercept=True, **kwargs):
         # hand every argument on to the base class by name
         super(GaussianKernel, self).__init__(
             schema_madlib=schema_madlib,
             gamma=gamma,
             n_components=n_components,
             random_state=random_state,
             random_weights=random_weights,
             random_offset=random_offset,
             id_col=id_col,
             val_col=val_col,
             orig_data=orig_data,
             fit_intercept=fit_intercept,
             **kwargs)

     @property
     def kernel_params(self):
         """Comma-separated 'name=value' string of the kernel parameters."""
         template = ('gamma={gamma}, n_components={n_components},'
                     'random_state={random_state}, fit_intercept={fit_intercept}, fit_in_memory=False')
         return template.format(gamma=self.gamma,
                                n_components=self.n_components,
                                random_state=self.random_state,
                                fit_intercept=self.fit_intercept)

     def fit(self, n_features):
         """Drop any previous random operators and draw new ones: an
         n_features x n_components weight matrix and a 1 x n_components
         offset vector.
         """
         self.clear()
         self.orig_data = None
         self.rd_val = unique_string(desp='val')
         self.rd_id = unique_string(desp='id')
         n_components = self.n_components
         self.rd_weights = self._random_weights(
             n_features, n_components, self.rd_id, self.rd_val)
         self.rd_offset = self._random_offsets(
             1, n_components, self.rd_id, self.rd_val)
         return self

     def transform(self, source_table, independent_varname,
                   dependent_varname=None, grouping_col=None, id_col=None,
                   transformed_name='gaussian_transformed'):
         """Map the features of `source_table` through the random operators.

         The result is written to a temp table whose name and column names
         are stored in `self.transformed_table`. Rows whose feature array
         contains NULL are skipped. Does nothing when the operators are not
         available.

         Returns:
             self
         """
         if not (self.rd_offset and self.rd_weights):
             return self
         self.original_table = {'source_table': source_table,
                                'independent_varname': independent_varname,
                                'dependent_varname': dependent_varname}
         madlib = self.schema_madlib
         group_expr = _cast_if_null(grouping_col, unique_string('grp_col'))
         target_expr = _cast_if_null(dependent_varname, '')
         id_expr = _cast_if_null(id_col, unique_string('id_col'))

         # copy data to the temporary table with id column
         # id_col is different from index_col
         # id_col is unique and, if any, is from the original table
         # index_col is generated randomly
         # needs to be sequential for madlib.matrix_mult to work
         indexed_copy = unique_string(desp='source_copied')
         features_col = unique_string(desp='features_col')
         target_col = unique_string(desp='target_col')
         index_col = unique_string(desp='index_col')

         copy_sql = """
             select setseed(0.5);
             drop table if exists {source_with_id};
             create temp table {source_with_id} as
                 select
                     row_number() over (order by random()) as {index_col},
                     {dependent_varname} as {target_col},
                     {independent_varname} as {features_col},
                     {id_col},
                     {grouping_col}
                 from {source_table}
                 WHERE not {schema_madlib}.array_contains_null({independent_varname})
         """.format(source_with_id=indexed_copy,
                    index_col=index_col,
                    dependent_varname=target_expr,
                    target_col=target_col,
                    independent_varname=independent_varname,
                    features_col=features_col,
                    id_col=id_expr,
                    grouping_col=group_expr,
                    source_table=source_table,
                    schema_madlib=madlib)
         plpy.execute(copy_sql)

         projected = unique_string(desp='temp_transformed')

         # X = X * weights, computed on the indexed copy
         mult_sql = """
             drop table if exists {temp_transformed};
             select {schema_madlib}.matrix_mult(
                           '{source_table}',
                           'row={index_col}, val={independent_varname}',
                           '{self.rd_weights}',
                           'row={self.rd_id}, val={self.rd_val}',
                           '{temp_transformed}',
                           'row={index_col}, val={independent_varname}');
         """.format(temp_transformed=projected,
                    schema_madlib=madlib,
                    source_table=indexed_copy,
                    index_col=index_col,
                    independent_varname=features_col,
                    self=self)
         plpy.execute(mult_sql)

         transformed = unique_string(desp=transformed_name)

         # rd_offset is a vector of n_components elements
         # which will fit in memory most of the time. Plus,
         # putting rd_offset in memory makes broadcasting it to
         # segments more efficiently
         offset_rows = plpy.execute("""
                       select {rd_val} as val from {rd_offset}
                       """.format(rd_val=self.rd_val,
                                  rd_offset=self.rd_offset))
         rd_offset_vals = offset_rows[0]['val']

         # X = a * cos (X + b)
         multiplier = sqrt(2. / self.n_components)
         if self.fit_intercept:
             intercept = "ARRAY[1]::float[]"
         else:
             intercept = "NULL"
         final_sql = """
             drop table if exists {transformed};
             create temp table {transformed} as
                 select
                     array_cat({schema_madlib}.array_scalar_mult(
                                     {schema_madlib}.array_cos(
                                         q.{independent_varname}::float[])::float[],
                                     {multiplier}::float)::float[],
                               {intercept}
                              ) as {independent_varname},
                     {dependent_varname},
                     {id_col},
                     {grouping_col}
                 from (
                     select
                         x.{index_col},
                         {schema_madlib}.array_add(
                             x.{independent_varname}::float[],
                             $1) as {independent_varname}
                     from {temp_transformed} as x
                 ) q join {source_table} s using ({index_col})
         """.format(transformed=transformed,
                    schema_madlib=madlib,
                    independent_varname=features_col,
                    multiplier=multiplier,
                    intercept=intercept,
                    dependent_varname=target_col,
                    id_col=id_expr,
                    grouping_col=group_expr,
                    index_col=index_col,
                    temp_transformed=projected,
                    source_table=indexed_copy)
         plpy.execute(plpy.prepare(final_sql, ['float[]']), [rd_offset_vals])
         # clear table generated from matrix mult
         plpy.execute("drop table if exists " + projected)
         self.transformed_table = {'index_col': index_col,
                                   'source_table': transformed,
                                   'dependent_varname': target_col,
                                   'independent_varname': features_col}
         return self


 class GaussianKernelInMemory(GaussianKernelBase):

     """Gaussian kernel approximation that keeps the random weights in a
     single array and applies them row by row inside one query.
     """

     def __init__(self, schema_madlib, gamma=1, n_components=100,
                  random_state=1, random_weights=None,
                  random_offset=None, id_col=None,
                  val_col=None, orig_data=None, fit_intercept=True, **kwargs):
         # hand every argument on to the base class by name
         super(GaussianKernelInMemory, self).__init__(
             schema_madlib=schema_madlib,
             gamma=gamma,
             n_components=n_components,
             random_state=random_state,
             random_weights=random_weights,
             random_offset=random_offset,
             id_col=id_col,
             val_col=val_col,
             orig_data=orig_data,
             fit_intercept=fit_intercept,
             **kwargs)

     @property
     def kernel_params(self):
         """Comma-separated 'name=value' string of the kernel parameters."""
         template = ('gamma={self.gamma}, n_components={self.n_components},'
                     'random_state={self.random_state}, '
                     'fit_intercept={self.fit_intercept}, fit_in_memory=True')
         return template.format(self=self)

     def fit(self, n_features):
         """Drop any previous random operators and draw new ones: a single
         row of n_features * n_components weights and a single row of
         n_components offsets.
         """
         self.clear()
         self.orig_data = None
         self.rd_val = unique_string(desp='val')
         self.rd_id = unique_string(desp='id')
         n_weights = n_features * self.n_components
         self.rd_weights = self._random_weights(
             1, n_weights, self.rd_id, self.rd_val)
         self.rd_offset = self._random_offsets(
             1, self.n_components, self.rd_id, self.rd_val)
         return self

     def transform(self, source_table, independent_varname,
                   dependent_varname=None, grouping_col=None, id_col=None,
                   transformed_name='gaussian_transformed'):
         """Map the features of `source_table` through the random operators.

         The result is written to a temp table whose name and column names
         are stored in `self.transformed_table`. Rows whose feature array
         contains NULL are skipped. Does nothing when the operators are not
         available.

         Returns:
             self
         """
         if not (self.rd_offset and self.rd_weights):
             return self

         self.original_table = {'source_table': source_table,
                                'independent_varname': independent_varname,
                                'dependent_varname': dependent_varname}

         def _or_null(value, alias):
             # a given column is used as is; a missing one becomes an
             # integer NULL, aliased when an alias is supplied
             if value:
                 return str(value)
             if alias:
                 return "NULL::integer" + " as " + alias
             return "NULL::integer"

         group_expr = _or_null(grouping_col, unique_string('grp_col'))
         target_expr = _or_null(dependent_varname, '')
         id_expr = _or_null(id_col, unique_string('id_col'))

         features_col = unique_string(desp='features_col')
         target_col = unique_string(desp='target_col')
         transformed = unique_string(desp=transformed_name)

         # X <- 1 + a * cos (X*C + b)
         multiplier = sqrt(2. / self.n_components)
         if self.fit_intercept:
             intercept = "ARRAY[1]::float[]"
         else:
             intercept = "NULL"
         run_sql = """
             drop table if exists {transformed};
             create temp table {transformed} as
                 select
                     array_cat(
                         {schema_madlib}.array_scalar_mult(
                             {schema_madlib}.array_cos(
                                 {schema_madlib}.array_add(
                                     {schema_madlib}.__matrix_vec_mult_in_mem(
                                         q.{features_col}::float[],
                                         rw.{self.rd_val}::float[]
                                     )::float[],
                                     ro.{self.rd_val}::float[]
                                 )::float[]
                             )::float[],
                             {multiplier}::float
                         )::float[],
                         {intercept}
                     ) as {features_col},
                     q.{target_col} as {target_col},
                     {id_col},
                     {grouping_col}
                 from (
                     select
                         {dependent_varname} as {target_col},
                         {independent_varname} as {features_col},
                         {id_col},
                         {grouping_col}
                     from {source_table}
                     WHERE not {schema_madlib}.array_contains_null({independent_varname})
                 ) q
                 cross join (select {self.rd_val} from {self.rd_weights}) as rw
                 cross join (select {self.rd_val} from {self.rd_offset}) as ro
         """.format(transformed=transformed,
                    schema_madlib=self.schema_madlib,
                    features_col=features_col,
                    self=self,
                    multiplier=multiplier,
                    intercept=intercept,
                    target_col=target_col,
                    id_col=id_expr,
                    grouping_col=group_expr,
                    dependent_varname=target_expr,
                    independent_varname=independent_varname,
                    source_table=source_table)
         plpy.execute(run_sql)
         self.transformed_table = {'source_table': transformed,
                                   'dependent_varname': target_col,
                                   'independent_varname': features_col}
         return self


 def create_kernel(schema_madlib, n_features, kernel_func, kernel_params_dict):
     """Create an unfitted kernel object for `kernel_func`.

     Recognised names are 'linear', 'gaussian' and 'polynomial'; any other
     value yields None.
     """
     if kernel_func == 'linear':
         # the linear kernel does not depend on the number of features
         return LinearKernel.create(schema_madlib, kernel_params_dict)
     if kernel_func == 'gaussian':
         return GaussianKernelBase.create(
             schema_madlib, n_features, kernel_params_dict)
     if kernel_func == 'polynomial':
         return PolyKernel.create(
             schema_madlib, n_features, kernel_params_dict)
     return None


 def load_kernel(schema_madlib, data, kernel_func, kernel_params_dict):
     """Restore a kernel object for `kernel_func` from the saved table `data`.

     Recognised names are 'linear', 'gaussian' and 'polynomial'; any other
     value yields None.
     """
     if kernel_func == 'linear':
         # a linear kernel stores nothing, so it is simply created anew
         return LinearKernel.create(schema_madlib, kernel_params_dict)
     if kernel_func == 'gaussian':
         return GaussianKernelBase.load_from(
             schema_madlib, data, kernel_params_dict)
     if kernel_func == 'polynomial':
         return PolyKernel.load_from(
             schema_madlib, data, kernel_params_dict)
     return None