blob: 1bfbeda0f70614849a0afb1c10d84232f6dae277 [file] [log] [blame]
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<!--\n",
"#\n",
"# Licensed to the Apache Software Foundation (ASF) under one or more\n",
"# contributor license agreements. See the NOTICE file distributed with\n",
"# this work for additional information regarding copyright ownership.\n",
"# The ASF licenses this file to You under the Apache License, Version 2.0\n",
"# (the \"License\"); you may not use this file except in compliance with\n",
"# the License. You may obtain a copy of the License at\n",
"#\n",
"# http://www.apache.org/licenses/LICENSE-2.0\n",
"#\n",
"# Unless required by applicable law or agreed to in writing, software\n",
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"#\n",
"-->\n",
"\n",
"# Precommit Job Times\n",
"This notebook fetches test statistics from Jenkins.\n",
"\n",
"## Requirements\n",
"\n",
"```shell\n",
"pip install pandas matplotlib requests\n",
"# You may need to restart Jupyter for matplotlib to work.\n",
"```\n",
"\n",
"**Note:** Requests to `builds.apache.org` must contain a ?depth= or ?tree= argument, otherwise your IP will get banned. [Policy](https://cwiki.apache.org/confluence/display/INFRA/Using+the+ASF+Jenkins+API)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as md\n",
"import requests"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch precommit job data from Jenkins.\n",
"\n",
"class Build(dict):\n",
"    \"\"\"Flattened summary of one Jenkins build, suitable for pd.DataFrame rows.\n",
"\n",
"    Args:\n",
"      job_name: Jenkins job name, e.g. 'beam_PreCommit_Java_Cron'.\n",
"      json: one entry of the job API's 'builds'/'allBuilds' list.\n",
"\n",
"    Raises:\n",
"      ValueError: if the build JSON has no TimeInQueueAction with timing data.\n",
"    \"\"\"\n",
"    def __init__(self, job_name, json):\n",
"        self['job_name'] = job_name\n",
"        self['result'] = json['result']\n",
"        self['number'] = json['number']\n",
"        # Jenkins reports timestamps as epoch milliseconds.\n",
"        self['timestamp'] = pd.Timestamp.utcfromtimestamp(json['timestamp'] / 1000)\n",
"        # Sentinels so a missing TimeInQueueAction is detected below.\n",
"        # Bug fix: the sentinels used to be stored under *Millis keys while the\n",
"        # checks read *Minutes keys, so a missing action raised KeyError instead\n",
"        # of the intended ValueError.\n",
"        self['queuingDurationMinutes'] = -1\n",
"        self['totalDurationMinutes'] = -1\n",
"        for action in json['actions']:\n",
"            if action.get('_class', None) == 'jenkins.metrics.impl.TimeInQueueAction':\n",
"                self['queuingDurationMinutes'] = action['queuingDurationMillis'] / 60000.\n",
"                self['totalDurationMinutes'] = action['totalDurationMillis'] / 60000.\n",
"        # Bug fix: format the message with %, rather than passing the dict as a\n",
"        # second exception argument.\n",
"        if self['queuingDurationMinutes'] == -1:\n",
"            raise ValueError('could not find queuingDurationMillis in: %s' % json)\n",
"        if self['totalDurationMinutes'] == -1:\n",
"            raise ValueError('could not find totalDurationMillis in: %s' % json)\n",
" \n",
"# Can be 'builds' (last 50) or 'allBuilds'.\n",
"builds_key = 'allBuilds'\n",
"\n",
"# Pull the build list for each precommit cron job and flatten into one frame.\n",
"builds = []\n",
"job_names = ['beam_PreCommit_Java_Cron', 'beam_PreCommit_Python_Cron', 'beam_PreCommit_Go_Cron']\n",
"for job_name in job_names:\n",
"    url = 'https://builds.apache.org/job/%s/api/json' % job_name\n",
"    # The ?tree= filter keeps the response small (and ASF infra requires a\n",
"    # depth/tree argument -- see the note at the top of the notebook).\n",
"    params = {\n",
"        'tree': '%s[result,number,timestamp,actions[queuingDurationMillis,totalDurationMillis]]' % builds_key}\n",
"    response = requests.get(url, params=params)\n",
"    for build_json in response.json()[builds_key]:\n",
"        builds.append(Build(job_name, build_json))\n",
"\n",
"df = pd.DataFrame(builds)\n",
"\n",
"def _since(delta):\n",
"    \"\"\"Rows of df whose timestamp falls within `delta` of now (naive UTC).\"\"\"\n",
"    cutoff = pd.Timestamp.utcnow().tz_convert(None) - delta\n",
"    return df[df.timestamp >= cutoff]\n",
"\n",
"df_4weeks = _since(pd.Timedelta(weeks=4))\n",
"df_1week = _since(pd.Timedelta(weeks=1))\n",
"df_1day = _since(pd.Timedelta(days=1))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot queuing and total duration over the last four weeks, one chart per job.\n",
"\n",
"for name in job_names:\n",
"    job_times = df_4weeks.loc[\n",
"        df_4weeks.job_name == name,\n",
"        ['timestamp', 'queuingDurationMinutes', 'totalDurationMinutes']]\n",
"    job_times.plot(x='timestamp').set_title(name)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 95th percentile of precommit run times, per job, over several time windows.\n",
"test_dfs = {'4 weeks': df_4weeks, '1 week': df_1week, '1 day': df_1day}\n",
"metrics = []\n",
"\n",
"for sample_time, test_df in test_dfs.items():\n",
"    for job_name in job_names:\n",
"        job_rows = test_df[test_df.job_name == job_name]\n",
"        success_rows = job_rows[job_rows.result == 'SUCCESS']\n",
"        short_name = job_name.replace('beam_PreCommit_', '').replace('_GradleBuild', '')\n",
"        for percentile in [95]:\n",
"            metrics.append({\n",
"                'job_name': '%s %s %dth' % (short_name, sample_time, percentile),\n",
"                'totalDurationMinutes_all': np.percentile(\n",
"                    job_rows.totalDurationMinutes, q=percentile),\n",
"                'totalDurationMinutes_success_only': np.percentile(\n",
"                    success_rows.totalDurationMinutes, q=percentile),\n",
"                'queuingDurationMinutes': np.percentile(\n",
"                    job_rows.queuingDurationMinutes, q=percentile),\n",
"            })\n",
"\n",
"pd.DataFrame(metrics).sort_values('job_name')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download per-test reports (precommit) from Jenkins.\n",
"MAX_FETCH_PER_JOB_TYPE = 5\n",
"\n",
"test_results_raw = []\n",
"for job_name in list(df.job_name.unique()):\n",
"    if job_name == 'beam_PreCommit_Go_Cron':\n",
"        # TODO: Go builds are missing testReport data on Jenkins.\n",
"        continue\n",
"    num_fetched = 0\n",
"    for build_num in list(df.number[df.job_name == job_name].unique()):\n",
"        url = 'https://builds.apache.org/job/%s/%s/testReport/api/json?depth=1' % (job_name, build_num)\n",
"        print('.', end='')\n",
"        response = requests.get(url)\n",
"        if not response.ok:\n",
"            # Typically a 404 means that the job is still running.\n",
"            print('skipping (%s): %s' % (response.status_code, url))\n",
"            continue\n",
"        report = response.json()\n",
"        report['job_name'] = job_name\n",
"        report['build_num'] = build_num\n",
"        test_results_raw.append(report)\n",
"\n",
"        num_fetched += 1\n",
"        if num_fetched >= MAX_FETCH_PER_JOB_TYPE:\n",
"            break\n",
"\n",
"print(' done')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Analyze individual test results.\n",
"\n",
"class TestResult(dict):\n",
"    \"\"\"One test case result, flattened so rows can feed pd.DataFrame.\"\"\"\n",
"\n",
"    def __init__(self, job_name, build_num, json):\n",
"        self['job_name'] = job_name\n",
"        self['build_num'] = build_num\n",
"        # Copy the fields of interest from the raw Jenkins case JSON.\n",
"        for field in ('name', 'duration', 'className', 'status'):\n",
"            self[field] = json[field]\n",
"\n",
"# Flatten every suite's cases into one row per test.\n",
"test_results = []\n",
"for raw in test_results_raw:\n",
"    for suite in raw['suites']:\n",
"        test_results.extend(\n",
"            TestResult(raw['job_name'], raw['build_num'], case)\n",
"            for case in suite['cases'])\n",
"\n",
"# Keep the slowest observation of each (class, job, test, status) combination.\n",
"df_tests = (pd.DataFrame(test_results)\n",
"            .drop(columns=['build_num'])\n",
"            .groupby(['className', 'job_name', 'name', 'status'], as_index=False)\n",
"            .max()\n",
"            .sort_values('duration', ascending=False))\n",
"\n",
"def filter_test_results(job_name, status):\n",
"    \"\"\"Return the 20 slowest tests matching the selected job and status.\"\"\"\n",
"    selected = df_tests\n",
"    if job_name != 'all':\n",
"        selected = selected[selected.job_name == job_name]\n",
"    if status != 'all':\n",
"        selected = selected[selected.status == status]\n",
"    return selected.head(n=20)\n",
"\n",
"from ipywidgets import interact\n",
"interact(filter_test_results,\n",
"         job_name=['all'] + list(df_tests.job_name.unique()),\n",
"         status=['all'] + list(df_tests.status.unique()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}