| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.\n", |
| " \"You should import from traitlets.config instead.\", ShimWarning)\n", |
| "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", |
| " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" |
| ] |
| } |
| ], |
| "source": [ |
| "%load_ext sql" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 3, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "data": { |
| "text/plain": [ |
| "u'Connected: fmcquillan@madlib'" |
| ] |
| }, |
| "execution_count": 3, |
| "metadata": {}, |
| "output_type": "execute_result" |
| } |
| ], |
| "source": [ |
| "# Greenplum Database 5.x on GCP (PM demo machine) - direct external IP access\n", |
| "#%sql postgresql://gpadmin@34.67.65.96:5432/madlib\n", |
| "\n", |
| "# Greenplum Database 5.x on GCP - via tunnel\n", |
| "#%sql postgresql://gpadmin@localhost:8000/madlib\n", |
| " \n", |
| "# PostgreSQL local\n", |
| "%sql postgresql://fmcquillan@localhost:5432/madlib" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 46, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "import numpy as np\n", |
| "\n", |
| "from random import random\n", |
| "from math import log, ceil\n", |
| "from time import time, ctime\n", |
| "\n", |
| "\n", |
| "class Hyperband:\n", |
| "\t\"\"\"Hyperband hyperparameter search (Li et al., JMLR 2018).\n", |
| "\n", |
| "\tget_params_function: returns one random hyperparameter configuration.\n", |
| "\ttry_params_function: takes ( n_iterations, params ) and returns a dict\n", |
| "\twith at least a 'loss' key (optionally 'early_stop').\n", |
| "\t\"\"\"\n", |
| "\n", |
| "\tdef __init__( self, get_params_function, try_params_function ):\n", |
| "\t\tself.get_params = get_params_function\n", |
| "\t\tself.try_params = try_params_function\n", |
| "\n", |
| "\t\tself.max_iter = 27 \t# maximum iterations per configuration\n", |
| "\t\tself.eta = 3\t\t\t# defines configuration downsampling rate (default = 3)\n", |
| "\n", |
| "\t\tself.logeta = lambda x: log( x ) / log( self.eta )\n", |
| "\t\tself.s_max = int( self.logeta( self.max_iter ))\n", |
| "\t\tself.B = ( self.s_max + 1 ) * self.max_iter\n", |
| "\n", |
| "\t\tself.results = []\t# list of dicts\n", |
| "\t\tself.counter = 0\n", |
| "\t\tself.best_loss = np.inf\n", |
| "\t\tself.best_counter = -1\n", |
| "\n", |
| "\n", |
| "\t# can be called multiple times\n", |
| "\tdef run( self, skip_last = 0, dry_run = False ):\n", |
| "\t\t\"\"\"Run the search; returns the accumulated list of result dicts.\"\"\"\n", |
| "\n", |
| "\t\tfor s in reversed( range( self.s_max + 1 )):\n", |
| "\n", |
| "\t\t\tprint( \" \" )\n", |
| "\t\t\tprint( \"s = {}\".format( s ))\n", |
| "\n", |
| "\t\t\t# initial number of configurations\n", |
| "\t\t\t# float() forces true division; under Python 2 the original integer\n", |
| "\t\t\t# division undercounted n (e.g. 9 instead of 12 for s = 2) relative\n", |
| "\t\t\t# to n = ceil( B/R * eta**s / (s+1) ) in the Hyperband paper.\n", |
| "\t\t\tn = int( ceil( float( self.B ) / self.max_iter / ( s + 1 ) * self.eta ** s ))\n", |
| "\n", |
| "\t\t\t# initial number of iterations per config\n", |
| "\t\t\tr = self.max_iter * self.eta ** ( -s )\n", |
| "\n", |
| "\t\t\t# n random configurations\n", |
| "\t\t\tT = [ self.get_params() for i in range( n )]\n", |
| "\n", |
| "\t\t\tfor i in range(( s + 1 ) - int( skip_last )):\t# changed from s + 1\n", |
| "\n", |
| "\t\t\t\t# Run each of the n configs for <iterations>\n", |
| "\t\t\t\t# and keep best (n_configs / eta) configurations\n", |
| "\n", |
| "\t\t\t\tn_configs = n * self.eta ** ( -i )\n", |
| "\t\t\t\tn_iterations = r * self.eta ** ( i )\n", |
| "\n", |
| "\t\t\t\t# print() call form (not the Python-2 print statement) so the\n", |
| "\t\t\t\t# cell runs under both Python 2 and Python 3 kernels\n", |
| "\t\t\t\tprint( \"\\n*** {} configurations x {:.1f} iterations each\".format(\n", |
| "\t\t\t\t\tn_configs, n_iterations ))\n", |
| "\n", |
| "\t\t\t\tval_losses = []\n", |
| "\t\t\t\tearly_stops = []\n", |
| "\n", |
| "\t\t\t\tfor t in T:\n", |
| "\n", |
| "\t\t\t\t\tself.counter += 1\n", |
| "\t\t\t\t\t#print( \"\\n{} | {} | lowest loss so far: {:.4f} (run {})\\n\".format(\n", |
| "\t\t\t\t\t#\tself.counter, ctime(), self.best_loss, self.best_counter ))\n", |
| "\n", |
| "\t\t\t\t\tstart_time = time()\n", |
| "\n", |
| "\t\t\t\t\tif dry_run:\n", |
| "\t\t\t\t\t\tresult = { 'loss': random(), 'log_loss': random(), 'auc': random()}\n", |
| "\t\t\t\t\telse:\n", |
| "\t\t\t\t\t\tresult = self.try_params( n_iterations, t )\t\t# <---\n", |
| "\n", |
| "\t\t\t\t\tassert( type( result ) == dict )\n", |
| "\t\t\t\t\tassert( 'loss' in result )\n", |
| "\n", |
| "\t\t\t\t\tseconds = int( round( time() - start_time ))\n", |
| "\t\t\t\t\t#print( \"\\n{} seconds.\".format( seconds ))\n", |
| "\n", |
| "\t\t\t\t\tloss = result['loss']\n", |
| "\t\t\t\t\tval_losses.append( loss )\n", |
| "\n", |
| "\t\t\t\t\tearly_stop = result.get( 'early_stop', False )\n", |
| "\t\t\t\t\tearly_stops.append( early_stop )\n", |
| "\n", |
| "\t\t\t\t\t# keeping track of the best result so far (for display only)\n", |
| "\t\t\t\t\t# could do it by checking results each time, but hey\n", |
| "\t\t\t\t\tif loss < self.best_loss:\n", |
| "\t\t\t\t\t\tself.best_loss = loss\n", |
| "\t\t\t\t\t\tself.best_counter = self.counter\n", |
| "\n", |
| "\t\t\t\t\tresult['counter'] = self.counter\n", |
| "\t\t\t\t\tresult['seconds'] = seconds\n", |
| "\t\t\t\t\tresult['params'] = t\n", |
| "\t\t\t\t\tresult['iterations'] = n_iterations\n", |
| "\n", |
| "\t\t\t\t\tself.results.append( result )\n", |
| "\n", |
| "\t\t\t\t# select a number of best configurations for the next loop\n", |
| "\t\t\t\t# filter out early stops, if any\n", |
| "\t\t\t\tindices = np.argsort( val_losses )\n", |
| "\t\t\t\tT = [ T[i] for i in indices if not early_stops[i]]\n", |
| "\t\t\t\tT = T[ 0:int( n_configs / self.eta )]\n", |
| "\n", |
| "\t\treturn self.results\n" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 6, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "def get_params():\n", |
| "    \"\"\"Stub for the demo: returns None as the hyperparameter configuration.\"\"\"\n", |
| "    return" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 7, |
| "metadata": {}, |
| "outputs": [], |
| "source": [ |
| "def try_params():\n", |
| "    \"\"\"Stub for the demo; not called when run( dry_run = True ).\n", |
| "\n", |
| "    NOTE(review): a real implementation must accept ( n_iterations, params ),\n", |
| "    since Hyperband.run calls self.try_params( n_iterations, t ) -- this\n", |
| "    zero-argument stub would raise TypeError on a non-dry run.\n", |
| "    \"\"\"\n", |
| "    return" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 47, |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| " \n", |
| "('s = ', 3)\n", |
| "\n", |
| "*** 27 configurations x 1.0 iterations each\n", |
| "\n", |
| "*** 9.0 configurations x 3.0 iterations each\n", |
| "\n", |
| "*** 3.0 configurations x 9.0 iterations each\n", |
| "\n", |
| "*** 1.0 configurations x 27.0 iterations each\n", |
| " \n", |
| "('s = ', 2)\n", |
| "\n", |
| "*** 9 configurations x 3.0 iterations each\n", |
| "\n", |
| "*** 3.0 configurations x 9.0 iterations each\n", |
| "\n", |
| "*** 1.0 configurations x 27.0 iterations each\n", |
| " \n", |
| "('s = ', 1)\n", |
| "\n", |
| "*** 6 configurations x 9.0 iterations each\n", |
| "\n", |
| "*** 2.0 configurations x 27.0 iterations each\n", |
| " \n", |
| "('s = ', 0)\n", |
| "\n", |
| "*** 4 configurations x 27.0 iterations each\n" |
| ] |
| } |
| ], |
| "source": [ |
| "#!/usr/bin/env python\n", |
| "\n", |
| "\"bare-bones demonstration of using hyperband to tune sklearn GBT\"\n", |
| "\n", |
| "#from hyperband import Hyperband\n", |
| "#from defs.gb import get_params, try_params\n", |
| "\n", |
| "hb = Hyperband( get_params, try_params )\n", |
| "\n", |
| "# no actual tuning, doesn't call try_params()\n", |
| "results = hb.run( dry_run = True )\n", |
| "\n", |
| "#results = hb.run( skip_last = 1 ) # shorter run\n", |
| "#results = hb.run()" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 2", |
| "language": "python", |
| "name": "python2" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 2 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython2", |
| "version": "2.7.10" |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 2 |
| } |