| { |
| "cells": [ |
| { |
| "cell_type": "code", |
| "execution_count": 1, |
| "id": "509d5a1e-10f2-4294-a104-9fdb85a29b0c", |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stderr", |
| "output_type": "stream", |
| "text": [ |
| "SLF4J: Class path contains multiple SLF4J bindings.\n", |
| "SLF4J: Found binding in [jar:file:/root/spark-3.3.1-bin-hadoop2/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", |
| "SLF4J: Found binding in [jar:file:/root/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", |
| "SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n", |
| "SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n", |
| "24/04/03 22:04:56 WARN Utils: Your hostname, vsr542 resolves to a loopback address: 127.0.1.1; using 10.0.2.142 instead (on interface eno1)\n", |
| "24/04/03 22:04:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", |
| "Setting default log level to \"WARN\".\n", |
| "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", |
| "24/04/03 22:04:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", |
| "24/04/03 22:05:00 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n" |
| ] |
| } |
| ], |
| "source": [ |
| "import findspark\n", |
| "# Locate the Spark installation so that pyspark can be imported below.\n", |
| "findspark.init()\n", |
| "\n", |
| "from pyspark import SparkConf, SparkContext\n", |
| "from pyspark.sql import SparkSession\n", |
| "\n", |
| "# Placeholder path: point this at the real Gluten bundle jar before running.\n", |
| "nativesql_jars = \"/path/to/gluten-XXXX.jar\"\n", |
| "\n", |
| "conf = SparkConf().setAppName(\"PySpark Gluten\").setMaster(\"yarn\")\n", |
| "conf.setAll([\n", |
| "    # Executor and driver sizing.\n", |
| "    (\"spark.executor.instances\", \"2\"),\n", |
| "    (\"spark.executor.memory\", \"6g\"),\n", |
| "    (\"spark.executor.cores\", \"2\"),\n", |
| "    (\"spark.driver.memory\", \"2g\"),\n", |
| "    # Off-heap memory settings (presumably required by Gluten's native engine; confirm against the Gluten docs).\n", |
| "    (\"spark.memory.offHeap.enabled\", \"true\"),\n", |
| "    (\"spark.memory.offHeap.size\", \"2g\"),\n", |
| "    (\"spark.executor.memoryOverhead\", \"384M\"),\n", |
| "    # Put the Gluten jar on both the driver and executor class paths.\n", |
| "    (\"spark.driver.extraClassPath\", nativesql_jars),\n", |
| "    (\"spark.executor.extraClassPath\", nativesql_jars),\n", |
| "    # Enable the Gluten plugin and its columnar shuffle manager.\n", |
| "    (\"spark.plugins\", \"org.apache.gluten.GlutenPlugin\"),\n", |
| "    (\"spark.gluten.loadLibFromJar\", \"false\"),\n", |
| "    (\"spark.shuffle.manager\", \"org.apache.spark.shuffle.sort.ColumnarShuffleManager\"),\n", |
| "])\n", |
| "\n", |
| "# getOrCreate() keeps this cell re-runnable: building a second SparkContext in a\n", |
| "# live kernel raises a ValueError because only one context may exist at a time.\n", |
| "# NOTE: on a re-run the existing context is returned and changes to `conf` are\n", |
| "# ignored; restart the kernel to apply new settings.\n", |
| "sc = SparkContext.getOrCreate(conf=conf)\n", |
| "spark_session = SparkSession.builder.getOrCreate()\n" |
| ] |
| }, |
| { |
| "cell_type": "code", |
| "execution_count": 2, |
| "id": "6c4ec66a-3b8c-4a65-b948-8420b492333d", |
| "metadata": {}, |
| "outputs": [ |
| { |
| "name": "stdout", |
| "output_type": "stream", |
| "text": [ |
| "== Physical Plan ==\n", |
| "VeloxColumnarToRowExec\n", |
| "+- ^(1) FilterExecTransformer (isnotnull(category#1) AND (category#1 = Cellphone))\n", |
| " +- ^(1) InputIteratorTransformer[product#0, category#1, revenue#2L]\n", |
| " +- ^(1) InputAdapter\n", |
| " +- ^(1) RowToVeloxColumnar\n", |
| " +- *(1) Scan ExistingRDD[product#0,category#1,revenue#2L]\n", |
| "\n", |
| "\n" |
| ] |
| } |
| ], |
| "source": [ |
| "# Small in-memory sales table: one (product, category, revenue) tuple per row.\n", |
| "df = spark_session.createDataFrame(\n", |
| "    data=[\n", |
| "        (\"Normal\", \"Cellphone\", 6000),\n", |
| "        (\"Normal\", \"Tablet\", 1500),\n", |
| "        (\"Mini\", \"Tablet\", 5500),\n", |
| "        (\"Mini\", \"Cellphone\", 5000),\n", |
| "        (\"Foldable\", \"Cellphone\", 6500),\n", |
| "        (\"Foldable\", \"Tablet\", 2500),\n", |
| "        (\"Pro\", \"Cellphone\", 3000),\n", |
| "        (\"Pro\", \"Tablet\", 4000),\n", |
| "        (\"Pro Max\", \"Cellphone\", 4500),\n", |
| "    ],\n", |
| "    schema=[\"product\", \"category\", \"revenue\"],\n", |
| ")\n", |
| "\n", |
| "# Print the physical plan of a simple filter. In the recorded output the filter\n", |
| "# shows up as FilterExecTransformer under VeloxColumnarToRowExec.\n", |
| "df.where(\"category = 'Cellphone'\").explain()" |
| ] |
| } |
| ], |
| "metadata": { |
| "kernelspec": { |
| "display_name": "Python 3 (ipykernel)", |
| "language": "python", |
| "name": "python3" |
| }, |
| "language_info": { |
| "codemirror_mode": { |
| "name": "ipython", |
| "version": 3 |
| }, |
| "file_extension": ".py", |
| "mimetype": "text/x-python", |
| "name": "python", |
| "nbconvert_exporter": "python", |
| "pygments_lexer": "ipython3", |
| "version": "3.9.19" |
| }, |
| "nbTranslate": { |
| "displayLangs": [ |
| "*" |
| ], |
| "hotkey": "alt-t", |
| "langInMainMenu": true, |
| "sourceLang": "en", |
| "targetLang": "fr", |
| "useGoogleTranslate": true |
| }, |
| "toc": { |
| "base_numbering": 1, |
| "nav_menu": {}, |
| "number_sections": true, |
| "sideBar": true, |
| "skip_h1_title": false, |
| "title_cell": "Table of Contents", |
| "title_sidebar": "Contents", |
| "toc_cell": false, |
| "toc_position": {}, |
| "toc_section_display": true, |
| "toc_window_display": false |
| } |
| }, |
| "nbformat": 4, |
| "nbformat_minor": 5 |
| } |