| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.mllib.linalg.distributed — PySpark 3.2.3 documentation</title> |
| |
| <link rel="stylesheet" href="../../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../../../../" src="../../../../_static/documentation_options.js"></script> |
| <script src="../../../../_static/jquery.js"></script> |
| <script src="../../../../_static/underscore.js"></script> |
| <script src="../../../../_static/doctools.js"></script> |
| <script src="../../../../_static/language_data.js"></script> |
| <script src="../../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <link rel="search" title="Search" href="../../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../../../../index.html"> |
| |
| <img src="../../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| |
| </div> |
| </nav> |
| </div> |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <nav id="bd-toc-nav"> |
| <ul class="nav section-nav flex-column"> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.mllib.linalg.distributed</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">Package for distributed linear algebra.</span> |
| <span class="sd">"""</span> |
| |
| <span class="kn">import</span> <span class="nn">sys</span> |
| |
| <span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">RDD</span><span class="p">,</span> <span class="n">since</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.common</span> <span class="kn">import</span> <span class="n">callMLlibFunc</span><span class="p">,</span> <span class="n">JavaModelWrapper</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">_convert_to_vector</span><span class="p">,</span> <span class="n">DenseMatrix</span><span class="p">,</span> <span class="n">Matrix</span><span class="p">,</span> <span class="n">QRDecomposition</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.stat</span> <span class="kn">import</span> <span class="n">MultivariateStatisticalSummary</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.storagelevel</span> <span class="kn">import</span> <span class="n">StorageLevel</span> |
| |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s1">'BlockMatrix'</span><span class="p">,</span> <span class="s1">'CoordinateMatrix'</span><span class="p">,</span> <span class="s1">'DistributedMatrix'</span><span class="p">,</span> <span class="s1">'IndexedRow'</span><span class="p">,</span> |
| <span class="s1">'IndexedRowMatrix'</span><span class="p">,</span> <span class="s1">'MatrixEntry'</span><span class="p">,</span> <span class="s1">'RowMatrix'</span><span class="p">,</span> <span class="s1">'SingularValueDecomposition'</span><span class="p">]</span> |
| |
| |
| <div class="viewcode-block" id="DistributedMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.DistributedMatrix.html#pyspark.mllib.linalg.distributed.DistributedMatrix">[docs]</a><span class="k">class</span> <span class="nc">DistributedMatrix</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a distributively stored matrix backed by one or</span> |
| <span class="sd"> more RDDs.</span> |
| |
| <span class="sd"> """</span> |
| <div class="viewcode-block" id="DistributedMatrix.numRows"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.DistributedMatrix.html#pyspark.mllib.linalg.distributed.DistributedMatrix.numRows">[docs]</a> <span class="k">def</span> <span class="nf">numRows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Get or compute the number of rows."""</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span></div> |
| |
| <div class="viewcode-block" id="DistributedMatrix.numCols"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.DistributedMatrix.html#pyspark.mllib.linalg.distributed.DistributedMatrix.numCols">[docs]</a> <span class="k">def</span> <span class="nf">numCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""Get or compute the number of cols."""</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span></div></div> |
| |
| |
| <div class="viewcode-block" id="RowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix">[docs]</a><span class="k">class</span> <span class="nc">RowMatrix</span><span class="p">(</span><span class="n">DistributedMatrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a row-oriented distributed Matrix with no meaningful</span> |
| <span class="sd"> row indices.</span> |
| |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rows : :py:class:`pyspark.RDD` or :py:class:`pyspark.sql.DataFrame`</span> |
| <span class="sd"> An RDD or DataFrame of vectors. If a DataFrame is provided, it must have a single</span> |
| <span class="sd"> vector typed column.</span> |
| <span class="sd"> numRows : int, optional</span> |
| <span class="sd"> Number of rows in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of rows will be determined by the number of</span> |
| <span class="sd"> records in the `rows` RDD.</span> |
| <span class="sd"> numCols : int, optional</span> |
| <span class="sd"> Number of columns in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of columns will be determined by the size of</span> |
| <span class="sd"> the first row.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="n">numRows</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">numCols</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Note: This docstring is not shown publicly.</span> |
| |
| <span class="sd"> Create a wrapper over a Java RowMatrix.</span> |
| |
| <span class="sd"> Publicly, we require that `rows` be an RDD or DataFrame. However, for</span> |
| <span class="sd"> internal usage, `rows` can also be a Java RowMatrix</span> |
| <span class="sd"> object, in which case we can wrap it directly. This</span> |
| <span class="sd"> assists in clean matrix conversions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> mat_diff = RowMatrix(rows)</span> |
| <span class="sd"> >>> (mat_diff._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> mat_same = RowMatrix(mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> >>> (mat_same._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="n">rows</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_vector</span><span class="p">)</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createRowMatrix"</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createRowMatrix"</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">JavaObject</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">rows</span><span class="o">.</span><span class="n">getClass</span><span class="p">()</span><span class="o">.</span><span class="n">getSimpleName</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"RowMatrix"</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">rows</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"rows should be an RDD of vectors, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">rows</span><span class="p">))</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span> <span class="o">=</span> <span class="n">JavaModelWrapper</span><span class="p">(</span><span class="n">java_matrix</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">rows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Rows of the RowMatrix stored as an RDD of vectors.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> mat = RowMatrix(sc.parallelize([[1, 2, 3], [4, 5, 6]]))</span> |
| <span class="sd"> >>> rows = mat.rows</span> |
| <span class="sd"> >>> rows.first()</span> |
| <span class="sd"> DenseVector([1.0, 2.0, 3.0])</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"rows"</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="RowMatrix.numRows"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.numRows">[docs]</a> <span class="k">def</span> <span class="nf">numRows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6],</span> |
| <span class="sd"> ... [7, 8, 9], [10, 11, 12]])</span> |
| |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 4</span> |
| |
| <span class="sd"> >>> mat = RowMatrix(rows, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numRows"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.numCols"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.numCols">[docs]</a> <span class="k">def</span> <span class="nf">numCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of cols.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6],</span> |
| <span class="sd"> ... [7, 8, 9], [10, 11, 12]])</span> |
| |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 3</span> |
| |
| <span class="sd"> >>> mat = RowMatrix(rows, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 6</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numCols"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.computeColumnSummaryStatistics"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.computeColumnSummaryStatistics">[docs]</a> <span class="k">def</span> <span class="nf">computeColumnSummaryStatistics</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes column-wise summary statistics.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`MultivariateStatisticalSummary`</span> |
| <span class="sd"> object containing column-wise summary statistics.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> colStats = mat.computeColumnSummaryStatistics()</span> |
| <span class="sd"> >>> colStats.mean()</span> |
| <span class="sd"> array([ 2.5, 3.5, 4.5])</span> |
| <span class="sd"> """</span> |
| <span class="n">java_col_stats</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"computeColumnSummaryStatistics"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">MultivariateStatisticalSummary</span><span class="p">(</span><span class="n">java_col_stats</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.computeCovariance"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.computeCovariance">[docs]</a> <span class="k">def</span> <span class="nf">computeCovariance</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the covariance matrix, treating each row as an</span> |
| <span class="sd"> observation.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This cannot be computed on matrices with more than 65535 columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2], [2, 1]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> mat.computeCovariance()</span> |
| <span class="sd"> DenseMatrix(2, 2, [0.5, -0.5, -0.5, 0.5], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"computeCovariance"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.computeGramianMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.computeGramianMatrix">[docs]</a> <span class="k">def</span> <span class="nf">computeGramianMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the Gramian matrix `A^T A`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This cannot be computed on matrices with more than 65535 columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [4, 5, 6]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> mat.computeGramianMatrix()</span> |
| <span class="sd"> DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"computeGramianMatrix"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.columnSimilarities"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.columnSimilarities">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.0.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">columnSimilarities</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">threshold</span><span class="o">=</span><span class="mf">0.0</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute similarities between columns of this matrix.</span> |
| |
| <span class="sd"> The threshold parameter is a trade-off knob between estimate</span> |
| <span class="sd"> quality and computational cost.</span> |
| |
| <span class="sd"> The default threshold setting of 0 guarantees deterministically</span> |
| <span class="sd"> correct results, but uses the brute-force approach of computing</span> |
| <span class="sd"> normalized dot products.</span> |
| |
| <span class="sd"> Setting the threshold to positive values uses a sampling</span> |
| <span class="sd"> approach and incurs strictly less computational cost than the</span> |
| <span class="sd"> brute-force approach. However the similarities computed will</span> |
| <span class="sd"> be estimates.</span> |
| |
| <span class="sd"> The sampling guarantees relative-error correctness for those</span> |
| <span class="sd"> pairs of columns that have similarity greater than the given</span> |
| <span class="sd"> similarity threshold.</span> |
| |
| <span class="sd"> To describe the guarantee, we set some notation:</span> |
| |
| <span class="sd"> - Let A be the smallest in magnitude non-zero element of</span> |
| <span class="sd"> this matrix.</span> |
| <span class="sd"> - Let B be the largest in magnitude non-zero element of</span> |
| <span class="sd"> this matrix.</span> |
| <span class="sd"> - Let L be the maximum number of non-zeros per row.</span> |
| |
| <span class="sd"> For example, for {0,1} matrices: A=B=1.</span> |
| <span class="sd"> Another example, for the Netflix matrix: A=1, B=5</span> |
| |
| <span class="sd"> For those column pairs that are above the threshold, the</span> |
| <span class="sd"> computed similarity is correct to within 20% relative error</span> |
| <span class="sd"> with probability at least 1 - (0.981)^(10/B).</span> |
| |
| <span class="sd"> The shuffle size is bounded by the *smaller* of the following</span> |
| <span class="sd"> two expressions:</span> |
| |
| <span class="sd"> - O(n log(n) L / (threshold * A))</span> |
| <span class="sd"> - O(m L^2)</span> |
| |
| <span class="sd"> The latter is the cost of the brute-force approach, so for</span> |
| <span class="sd"> non-zero thresholds, the cost is always cheaper than the</span> |
| <span class="sd"> brute-force approach.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> threshold : float, optional</span> |
| <span class="sd"> Set to 0 for deterministic guaranteed</span> |
| <span class="sd"> correctness. Similarities above this</span> |
| <span class="sd"> threshold are estimated with the cost vs</span> |
| <span class="sd"> estimate quality trade-off described above.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`CoordinateMatrix`</span> |
| <span class="sd"> An n x n sparse upper-triangular CoordinateMatrix of</span> |
| <span class="sd"> cosine similarities between columns of this matrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2], [1, 5]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> sims = mat.columnSimilarities()</span> |
| <span class="sd"> >>> sims.entries.first().value</span> |
| <span class="sd"> 0.91914503...</span> |
| <span class="sd"> """</span> |
| <span class="n">java_sims_mat</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"columnSimilarities"</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="n">threshold</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">java_sims_mat</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.tallSkinnyQR"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.tallSkinnyQR">[docs]</a> <span class="k">def</span> <span class="nf">tallSkinnyQR</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">computeQ</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute the QR decomposition of this RowMatrix.</span> |
| |
| <span class="sd"> The implementation is designed to optimize the QR decomposition</span> |
| <span class="sd"> (factorization) for the RowMatrix of a tall and skinny shape [1]_.</span> |
| |
| <span class="sd"> .. [1] Paul G. Constantine, David F. Gleich. "Tall and skinny QR</span> |
| <span class="sd"> factorizations in MapReduce architectures"</span> |
| <span class="sd"> https://doi.org/10.1145/1996092.1996103</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> computeQ : bool, optional</span> |
| <span class="sd"> Whether to also compute the Q factor of the decomposition.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`pyspark.mllib.linalg.QRDecomposition`</span> |
| <span class="sd"> QRDecomposition(Q: RowMatrix, R: Matrix), where</span> |
| <span class="sd"> Q is None if computeQ is False.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[3, -6], [4, -8], [0, 1]])</span> |
| <span class="sd"> >>> mat = RowMatrix(rows)</span> |
| <span class="sd"> >>> decomp = mat.tallSkinnyQR(True)</span> |
| <span class="sd"> >>> Q = decomp.Q</span> |
| <span class="sd"> >>> R = decomp.R</span> |
| |
| <span class="sd"> >>> # Test with absolute values</span> |
| <span class="sd"> >>> absQRows = Q.rows.map(lambda row: abs(row.toArray()).tolist())</span> |
| <span class="sd"> >>> absQRows.collect()</span> |
| <span class="sd"> [[0.6..., 0.0], [0.8..., 0.0], [0.0, 1.0]]</span> |
| |
| <span class="sd"> >>> # Test with absolute values</span> |
| <span class="sd"> >>> abs(R.toArray()).tolist()</span> |
| <span class="sd"> [[5.0, 10.0], [0.0, 1.0]]</span> |
| <span class="sd"> """</span> |
| <span class="n">decomp</span> <span class="o">=</span> <span class="n">JavaModelWrapper</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"tallSkinnyQR"</span><span class="p">,</span> <span class="n">computeQ</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">computeQ</span><span class="p">:</span> |
| <span class="n">java_Q</span> <span class="o">=</span> <span class="n">decomp</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"Q"</span><span class="p">)</span> |
| <span class="n">Q</span> <span class="o">=</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">java_Q</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">Q</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">R</span> <span class="o">=</span> <span class="n">decomp</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"R"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">QRDecomposition</span><span class="p">(</span><span class="n">Q</span><span class="p">,</span> <span class="n">R</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.computeSVD"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.computeSVD">[docs]</a> <span class="k">def</span> <span class="nf">computeSVD</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">computeU</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">rCond</span><span class="o">=</span><span class="mf">1e-9</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the singular value decomposition of the RowMatrix.</span> |
| |
| <span class="sd"> The given row matrix A of dimension (m X n) is decomposed into</span> |
| <span class="sd"> U * s * V' where</span> |
| |
| <span class="sd"> - U: (m X k) (left singular vectors) is a RowMatrix whose</span> |
| <span class="sd"> columns are the eigenvectors of (A X A')</span> |
| <span class="sd"> - s: DenseVector consisting of the square roots of the eigenvalues</span> |
| <span class="sd"> (singular values) in descending order.</span> |
| <span class="sd"> - V: (n X k) (right singular vectors) is a Matrix whose columns</span> |
| <span class="sd"> are the eigenvectors of (A' X A)</span> |
| |
| <span class="sd"> For more specific details on implementation, please refer to</span> |
| <span class="sd"> the Scala documentation.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> k : int</span> |
| <span class="sd"> Number of leading singular values to keep (`0 < k <= n`).</span> |
| <span class="sd"> It might return fewer than k if there are numerically zero singular values</span> |
| <span class="sd"> or there are not enough Ritz values converged before the maximum number of</span> |
| <span class="sd"> Arnoldi update iterations is reached (in case that matrix A is ill-conditioned).</span> |
| <span class="sd"> computeU : bool, optional</span> |
| <span class="sd"> Whether or not to compute U. If set to be</span> |
| <span class="sd"> True, then U is computed by A * V * s^-1</span> |
| <span class="sd"> rCond : float, optional</span> |
| <span class="sd"> Reciprocal condition number. All singular values</span> |
| <span class="sd"> smaller than rCond * s[0] are treated as zero</span> |
| <span class="sd"> where s[0] is the largest singular value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`SingularValueDecomposition`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[3, 1, 1], [-1, 3, 1]])</span> |
| <span class="sd"> >>> rm = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> svd_model = rm.computeSVD(2, True)</span> |
| <span class="sd"> >>> svd_model.U.rows.collect()</span> |
| <span class="sd"> [DenseVector([-0.7071, 0.7071]), DenseVector([-0.7071, -0.7071])]</span> |
| <span class="sd"> >>> svd_model.s</span> |
| <span class="sd"> DenseVector([3.4641, 3.1623])</span> |
| <span class="sd"> >>> svd_model.V</span> |
| <span class="sd"> DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="n">j_model</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span> |
| <span class="s2">"computeSVD"</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">k</span><span class="p">),</span> <span class="nb">bool</span><span class="p">(</span><span class="n">computeU</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">rCond</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">SingularValueDecomposition</span><span class="p">(</span><span class="n">j_model</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.computePrincipalComponents"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.computePrincipalComponents">[docs]</a> <span class="k">def</span> <span class="nf">computePrincipalComponents</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the k principal components of the given row matrix</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This cannot be computed on matrices with more than 65535 columns.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> k : int</span> |
| <span class="sd"> Number of principal components to keep.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`pyspark.mllib.linalg.DenseMatrix`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([[1, 2, 3], [2, 4, 5], [3, 6, 1]])</span> |
| <span class="sd"> >>> rm = RowMatrix(rows)</span> |
| |
| <span class="sd"> >>> # Returns the two principal components of rm</span> |
| <span class="sd"> >>> pca = rm.computePrincipalComponents(2)</span> |
| <span class="sd"> >>> pca</span> |
| <span class="sd"> DenseMatrix(3, 2, [-0.349, -0.6981, 0.6252, -0.2796, -0.5592, -0.7805], 0)</span> |
| |
| <span class="sd"> >>> # Transform into new dimensions with the greatest variance.</span> |
| <span class="sd"> >>> rm.multiply(pca).rows.collect() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> [DenseVector([0.1305, -3.7394]), DenseVector([-0.3642, -6.6983]), \</span> |
| <span class="sd"> DenseVector([-4.6102, -4.9745])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"computePrincipalComponents"</span><span class="p">,</span> <span class="n">k</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="RowMatrix.multiply"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.RowMatrix.html#pyspark.mllib.linalg.distributed.RowMatrix.multiply">[docs]</a> <span class="k">def</span> <span class="nf">multiply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">matrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Multiply this matrix by a local dense matrix on the right.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> matrix : :py:class:`pyspark.mllib.linalg.Matrix`</span> |
| <span class="sd"> a local dense matrix whose number of rows must match the number of columns</span> |
| <span class="sd"> of this matrix</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`RowMatrix`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rm = RowMatrix(sc.parallelize([[0, 1], [2, 3]]))</span> |
| <span class="sd"> >>> rm.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()</span> |
| <span class="sd"> [DenseVector([2.0, 3.0]), DenseVector([6.0, 11.0])]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">matrix</span><span class="p">,</span> <span class="n">DenseMatrix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Only multiplication with DenseMatrix is supported."</span><span class="p">)</span> |
| <span class="n">j_model</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"multiply"</span><span class="p">,</span> <span class="n">matrix</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">j_model</span><span class="p">)</span></div></div> |
| |
| |
| <div class="viewcode-block" id="SingularValueDecomposition"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.SingularValueDecomposition.html#pyspark.mllib.linalg.distributed.SingularValueDecomposition">[docs]</a><span class="k">class</span> <span class="nc">SingularValueDecomposition</span><span class="p">(</span><span class="n">JavaModelWrapper</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents singular value decomposition (SVD) factors.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| <span class="sd"> """</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.2.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">U</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a distributed matrix whose columns are the left</span> |
| <span class="sd"> singular vectors of the SingularValueDecomposition if computeU was set to be True.</span> |
| <span class="sd"> """</span> |
| <span class="n">u</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"U"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">u</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">mat_name</span> <span class="o">=</span> <span class="n">u</span><span class="o">.</span><span class="n">getClass</span><span class="p">()</span><span class="o">.</span><span class="n">getSimpleName</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">mat_name</span> <span class="o">==</span> <span class="s2">"RowMatrix"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">u</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">mat_name</span> <span class="o">==</span> <span class="s2">"IndexedRowMatrix"</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">u</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Expected RowMatrix/IndexedRowMatrix got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">mat_name</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.2.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">s</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a DenseVector with singular values in descending order.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"s"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.2.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">V</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns a DenseMatrix whose columns are the right singular</span> |
| <span class="sd"> vectors of the SingularValueDecomposition.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"V"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="IndexedRow"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRow.html#pyspark.mllib.linalg.distributed.IndexedRow">[docs]</a><span class="k">class</span> <span class="nc">IndexedRow</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a row of an IndexedRowMatrix.</span> |
| |
| <span class="sd"> Just a wrapper over an (int, vector) tuple.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index : int</span> |
| <span class="sd"> The index for the given row.</span> |
| <span class="sd"> vector : :py:class:`pyspark.mllib.linalg.Vector` or convertible</span> |
| <span class="sd"> The row in the matrix at the given index.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">index</span><span class="p">,</span> <span class="n">vector</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">vector</span> <span class="o">=</span> <span class="n">_convert_to_vector</span><span class="p">(</span><span class="n">vector</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="k">return</span> <span class="s2">"IndexedRow(</span><span class="si">%s</span><span class="s2">, </span><span class="si">%s</span><span class="s2">)"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">index</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">vector</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_convert_to_indexed_row</span><span class="p">(</span><span class="n">row</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">row</span><span class="p">,</span> <span class="n">IndexedRow</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">row</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">row</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">IndexedRow</span><span class="p">(</span><span class="o">*</span><span class="n">row</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot convert type </span><span class="si">%s</span><span class="s2"> into IndexedRow"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">row</span><span class="p">))</span> |
| |
| |
| <div class="viewcode-block" id="IndexedRowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix">[docs]</a><span class="k">class</span> <span class="nc">IndexedRowMatrix</span><span class="p">(</span><span class="n">DistributedMatrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a row-oriented distributed Matrix with indexed rows.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rows : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> An RDD of IndexedRows or (int, vector) tuples or a DataFrame consisting of an</span> |
| <span class="sd"> int-typed column of indices and a vector-typed column.</span> |
| <span class="sd"> numRows : int, optional</span> |
| <span class="sd"> Number of rows in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of rows will be determined by the max row</span> |
| <span class="sd"> index plus one.</span> |
| <span class="sd"> numCols : int, optional</span> |
| <span class="sd"> Number of columns in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of columns will be determined by the size of</span> |
| <span class="sd"> the first row.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="n">numRows</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">numCols</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Note: This docstring is not shown publicly.</span> |
| |
| <span class="sd"> Create a wrapper over a Java IndexedRowMatrix.</span> |
| |
| <span class="sd"> Publicly, we require that `rows` be an RDD or DataFrame. However, for</span> |
| <span class="sd"> internal usage, `rows` can also be a Java IndexedRowMatrix</span> |
| <span class="sd"> object, in which case we can wrap it directly. This</span> |
| <span class="sd"> assists in clean matrix conversions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(1, [4, 5, 6])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows)</span> |
| |
| <span class="sd"> >>> mat_diff = IndexedRowMatrix(rows)</span> |
| <span class="sd"> >>> (mat_diff._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> mat_same = IndexedRowMatrix(mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> >>> (mat_same._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="n">rows</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_indexed_row</span><span class="p">)</span> |
| <span class="c1"># We use DataFrames for serialization of IndexedRows from</span> |
| <span class="c1"># Python, so first convert the RDD to a DataFrame on this</span> |
| <span class="c1"># side. This will convert each IndexedRow to a Row</span> |
| <span class="c1"># containing the 'index' and 'vector' values, which can</span> |
| <span class="c1"># both be easily serialized. We will convert back to</span> |
| <span class="c1"># IndexedRows on the Scala side.</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createIndexedRowMatrix"</span><span class="p">,</span> <span class="n">rows</span><span class="o">.</span><span class="n">toDF</span><span class="p">(),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createIndexedRowMatrix"</span><span class="p">,</span> <span class="n">rows</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">rows</span><span class="p">,</span> <span class="n">JavaObject</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">rows</span><span class="o">.</span><span class="n">getClass</span><span class="p">()</span><span class="o">.</span><span class="n">getSimpleName</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"IndexedRowMatrix"</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">rows</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"rows should be an RDD of IndexedRows or (int, vector) tuples, "</span> |
| <span class="s2">"got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">rows</span><span class="p">))</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span> <span class="o">=</span> <span class="n">JavaModelWrapper</span><span class="p">(</span><span class="n">java_matrix</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">rows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Rows of the IndexedRowMatrix stored as an RDD of IndexedRows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(1, [4, 5, 6])]))</span> |
| <span class="sd"> >>> rows = mat.rows</span> |
| <span class="sd"> >>> rows.first()</span> |
| <span class="sd"> IndexedRow(0, [1.0,2.0,3.0])</span> |
| <span class="sd"> """</span> |
| <span class="c1"># We use DataFrames for serialization of IndexedRows from</span> |
| <span class="c1"># Java, so we first convert the RDD of rows to a DataFrame</span> |
| <span class="c1"># on the Scala/Java side. Then we map each Row in the</span> |
| <span class="c1"># DataFrame back to an IndexedRow on this side.</span> |
| <span class="n">rows_df</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"getIndexedRows"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span><span class="p">)</span> |
| <span class="n">rows</span> <span class="o">=</span> <span class="n">rows_df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">IndexedRow</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">row</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> |
| <span class="k">return</span> <span class="n">rows</span> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.numRows"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.numRows">[docs]</a> <span class="k">def</span> <span class="nf">numRows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(1, [4, 5, 6]),</span> |
| <span class="sd"> ... IndexedRow(2, [7, 8, 9]),</span> |
| <span class="sd"> ... IndexedRow(3, [10, 11, 12])])</span> |
| |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 4</span> |
| |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numRows"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.numCols"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.numCols">[docs]</a> <span class="k">def</span> <span class="nf">numCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of cols.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(1, [4, 5, 6]),</span> |
| <span class="sd"> ... IndexedRow(2, [7, 8, 9]),</span> |
| <span class="sd"> ... IndexedRow(3, [10, 11, 12])])</span> |
| |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 3</span> |
| |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 6</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numCols"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.columnSimilarities"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.columnSimilarities">[docs]</a> <span class="k">def</span> <span class="nf">columnSimilarities</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute all cosine similarities between columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(6, [4, 5, 6])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows)</span> |
| <span class="sd"> >>> cs = mat.columnSimilarities()</span> |
| <span class="sd"> >>> print(cs.numCols())</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="n">java_coordinate_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"columnSimilarities"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">java_coordinate_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.computeGramianMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeGramianMatrix">[docs]</a> <span class="k">def</span> <span class="nf">computeGramianMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the Gramian matrix `A^T A`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This cannot be computed on matrices with more than 65535 columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(1, [4, 5, 6])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows)</span> |
| |
| <span class="sd"> >>> mat.computeGramianMatrix()</span> |
| <span class="sd"> DenseMatrix(3, 3, [17.0, 22.0, 27.0, 22.0, 29.0, 36.0, 27.0, 36.0, 45.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"computeGramianMatrix"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.toRowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toRowMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toRowMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a RowMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(6, [4, 5, 6])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows).toRowMatrix()</span> |
| <span class="sd"> >>> mat.rows.collect()</span> |
| <span class="sd"> [DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0])]</span> |
| <span class="sd"> """</span> |
| <span class="n">java_row_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toRowMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">java_row_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.toCoordinateMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toCoordinateMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toCoordinateMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a CoordinateMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 0]),</span> |
| <span class="sd"> ... IndexedRow(6, [0, 5])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows).toCoordinateMatrix()</span> |
| <span class="sd"> >>> mat.entries.take(3)</span> |
| <span class="sd"> [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 0.0), MatrixEntry(6, 0, 0.0)]</span> |
| <span class="sd"> """</span> |
| <span class="n">java_coordinate_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toCoordinateMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">java_coordinate_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.toBlockMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.toBlockMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toBlockMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">rowsPerBlock</span><span class="o">=</span><span class="mi">1024</span><span class="p">,</span> <span class="n">colsPerBlock</span><span class="o">=</span><span class="mi">1024</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a BlockMatrix.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rowsPerBlock : int, optional</span> |
| <span class="sd"> Number of rows that make up each block.</span> |
| <span class="sd"> The blocks forming the final rows are not</span> |
| <span class="sd"> required to have the given number of rows.</span> |
| <span class="sd"> colsPerBlock : int, optional</span> |
| <span class="sd"> Number of columns that make up each block.</span> |
| <span class="sd"> The blocks forming the final columns are not</span> |
| <span class="sd"> required to have the given number of columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = sc.parallelize([IndexedRow(0, [1, 2, 3]),</span> |
| <span class="sd"> ... IndexedRow(6, [4, 5, 6])])</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(rows).toBlockMatrix()</span> |
| |
| <span class="sd"> >>> # This IndexedRowMatrix will have 7 effective rows, due to</span> |
| <span class="sd"> >>> # the highest row index being 6, and the ensuing</span> |
| <span class="sd"> >>> # BlockMatrix will have 7 rows as well.</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="n">java_block_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toBlockMatrix"</span><span class="p">,</span> |
| <span class="n">rowsPerBlock</span><span class="p">,</span> |
| <span class="n">colsPerBlock</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_block_matrix</span><span class="p">,</span> <span class="n">rowsPerBlock</span><span class="p">,</span> <span class="n">colsPerBlock</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.computeSVD"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.computeSVD">[docs]</a> <span class="k">def</span> <span class="nf">computeSVD</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">computeU</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">rCond</span><span class="o">=</span><span class="mf">1e-9</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Computes the singular value decomposition of the IndexedRowMatrix.</span> |
| |
| <span class="sd"> The given row matrix A of dimension (m X n) is decomposed into</span> |
| <span class="sd"> U * s * V' where</span> |
| |
| <span class="sd"> * U: (m X k) (left singular vectors) is an IndexedRowMatrix</span> |
| <span class="sd"> whose columns are the eigenvectors of (A X A')</span> |
| <span class="sd"> * s: DenseVector consisting of square root of the eigenvalues</span> |
| <span class="sd"> (singular values) in descending order.</span> |
| <span class="sd"> * V: (n X k) (right singular vectors) is a Matrix whose columns</span> |
| <span class="sd"> are the eigenvectors of (A' X A)</span> |
| |
| <span class="sd"> For more specific details on implementation, please refer</span> |
| <span class="sd"> to the Scala documentation.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> k : int</span> |
| <span class="sd"> Number of leading singular values to keep (`0 < k <= n`).</span> |
| <span class="sd"> It might return fewer than k if there are numerically zero singular values</span> |
| <span class="sd"> or there are not enough Ritz values converged before the maximum number of</span> |
| <span class="sd"> Arnoldi update iterations is reached (in case that matrix A is ill-conditioned).</span> |
| <span class="sd"> computeU : bool, optional</span> |
| <span class="sd"> Whether or not to compute U. If set to be</span> |
| <span class="sd"> True, then U is computed by A * V * s^-1</span> |
| <span class="sd"> rCond : float, optional</span> |
| <span class="sd"> Reciprocal condition number. All singular values</span> |
| <span class="sd"> smaller than rCond * s[0] are treated as zero</span> |
| <span class="sd"> where s[0] is the largest singular value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`SingularValueDecomposition`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> rows = [(0, (3, 1, 1)), (1, (-1, 3, 1))]</span> |
| <span class="sd"> >>> irm = IndexedRowMatrix(sc.parallelize(rows))</span> |
| <span class="sd"> >>> svd_model = irm.computeSVD(2, True)</span> |
| <span class="sd"> >>> svd_model.U.rows.collect() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> [IndexedRow(0, [-0.707106781187,0.707106781187]),\</span> |
| <span class="sd"> IndexedRow(1, [-0.707106781187,-0.707106781187])]</span> |
| <span class="sd"> >>> svd_model.s</span> |
| <span class="sd"> DenseVector([3.4641, 3.1623])</span> |
| <span class="sd"> >>> svd_model.V</span> |
| <span class="sd"> DenseMatrix(3, 2, [-0.4082, -0.8165, -0.4082, 0.8944, -0.4472, 0.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="n">j_model</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span> |
| <span class="s2">"computeSVD"</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">k</span><span class="p">),</span> <span class="nb">bool</span><span class="p">(</span><span class="n">computeU</span><span class="p">),</span> <span class="nb">float</span><span class="p">(</span><span class="n">rCond</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">SingularValueDecomposition</span><span class="p">(</span><span class="n">j_model</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="IndexedRowMatrix.multiply"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.IndexedRowMatrix.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix.multiply">[docs]</a> <span class="k">def</span> <span class="nf">multiply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">matrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Multiply this matrix by a local dense matrix on the right.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> matrix : :py:class:`pyspark.mllib.linalg.Matrix`</span> |
| <span class="sd"> a local dense matrix whose number of rows must match the number of columns</span> |
| <span class="sd"> of this matrix</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :py:class:`IndexedRowMatrix`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> mat = IndexedRowMatrix(sc.parallelize([(0, (0, 1)), (1, (2, 3))]))</span> |
| <span class="sd"> >>> mat.multiply(DenseMatrix(2, 2, [0, 2, 1, 3])).rows.collect()</span> |
| <span class="sd"> [IndexedRow(0, [2.0,3.0]), IndexedRow(1, [6.0,11.0])]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">matrix</span><span class="p">,</span> <span class="n">DenseMatrix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Only multiplication with DenseMatrix is supported."</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"multiply"</span><span class="p">,</span> <span class="n">matrix</span><span class="p">))</span></div></div> |
| |
| |
| <div class="viewcode-block" id="MatrixEntry"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.MatrixEntry.html#pyspark.mllib.linalg.distributed.MatrixEntry">[docs]</a><span class="k">class</span> <span class="nc">MatrixEntry</span><span class="p">(</span><span class="nb">object</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents an entry of a CoordinateMatrix.</span> |
| |
| <span class="sd"> Just a wrapper over an (int, int, float) tuple.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> i : int</span> |
| <span class="sd"> The row index of the matrix.</span> |
| <span class="sd"> j : int</span> |
| <span class="sd"> The column index of the matrix.</span> |
| <span class="sd"> value : float</span> |
| <span class="sd"> The (i, j)th entry of the matrix, as a float.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">i</span><span class="p">,</span> <span class="n">j</span><span class="p">,</span> <span class="n">value</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">i</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">j</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">j</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="k">return</span> <span class="s2">"MatrixEntry(</span><span class="si">%s</span><span class="s2">, </span><span class="si">%s</span><span class="s2">, </span><span class="si">%s</span><span class="s2">)"</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">i</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">j</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">value</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_convert_to_matrix_entry</span><span class="p">(</span><span class="n">entry</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">entry</span><span class="p">,</span> <span class="n">MatrixEntry</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">entry</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">entry</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">entry</span><span class="p">)</span> <span class="o">==</span> <span class="mi">3</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="o">*</span><span class="n">entry</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot convert type </span><span class="si">%s</span><span class="s2"> into MatrixEntry"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">entry</span><span class="p">))</span> |
| |
| |
| <div class="viewcode-block" id="CoordinateMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix">[docs]</a><span class="k">class</span> <span class="nc">CoordinateMatrix</span><span class="p">(</span><span class="n">DistributedMatrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a matrix in coordinate format.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> entries : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> An RDD of MatrixEntry inputs or</span> |
| <span class="sd"> (int, int, float) tuples.</span> |
| <span class="sd"> numRows : int, optional</span> |
| <span class="sd"> Number of rows in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of rows will be determined by the max row</span> |
| <span class="sd"> index plus one.</span> |
| <span class="sd"> numCols : int, optional</span> |
| <span class="sd"> Number of columns in the matrix. A non-positive</span> |
| <span class="sd"> value means unknown, at which point the number</span> |
| <span class="sd"> of columns will be determined by the max column</span> |
| <span class="sd"> index plus one.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">entries</span><span class="p">,</span> <span class="n">numRows</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">numCols</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Note: This docstring is not shown publicly.</span> |
| |
| <span class="sd"> Create a wrapper over a Java CoordinateMatrix.</span> |
| |
| <span class="sd"> Publicly, we require that `entries` be an RDD. However, for</span> |
| <span class="sd"> internal usage, `entries` can also be a Java CoordinateMatrix</span> |
| <span class="sd"> object, in which case we can wrap it directly. This</span> |
| <span class="sd"> assists in clean matrix conversions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(6, 4, 2.1)])</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(entries)</span> |
| |
| <span class="sd"> >>> mat_diff = CoordinateMatrix(entries)</span> |
| <span class="sd"> >>> (mat_diff._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> mat_same = CoordinateMatrix(mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> >>> (mat_same._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">entries</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">entries</span> <span class="o">=</span> <span class="n">entries</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_matrix_entry</span><span class="p">)</span> |
| <span class="c1"># We use DataFrames for serialization of MatrixEntry entries</span> |
| <span class="c1"># from Python, so first convert the RDD to a DataFrame on</span> |
| <span class="c1"># this side. This will convert each MatrixEntry to a Row</span> |
| <span class="c1"># containing the 'i', 'j', and 'value' values, which can</span> |
| <span class="c1"># each be easily serialized. We will convert back to</span> |
| <span class="c1"># MatrixEntry inputs on the Scala side.</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createCoordinateMatrix"</span><span class="p">,</span> <span class="n">entries</span><span class="o">.</span><span class="n">toDF</span><span class="p">(),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">entries</span><span class="p">,</span> <span class="n">JavaObject</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">entries</span><span class="o">.</span><span class="n">getClass</span><span class="p">()</span><span class="o">.</span><span class="n">getSimpleName</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"CoordinateMatrix"</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">entries</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"entries should be an RDD of MatrixEntry entries or "</span> |
| <span class="s2">"(int, int, float) tuples, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">entries</span><span class="p">))</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span> <span class="o">=</span> <span class="n">JavaModelWrapper</span><span class="p">(</span><span class="n">java_matrix</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">entries</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Entries of the CoordinateMatrix stored as an RDD of</span> |
| <span class="sd"> MatrixEntries.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(6, 4, 2.1)]))</span> |
| <span class="sd"> >>> entries = mat.entries</span> |
| <span class="sd"> >>> entries.first()</span> |
| <span class="sd"> MatrixEntry(0, 0, 1.2)</span> |
| <span class="sd"> """</span> |
| <span class="c1"># We use DataFrames for serialization of MatrixEntry entries</span> |
| <span class="c1"># from Java, so we first convert the RDD of entries to a</span> |
| <span class="c1"># DataFrame on the Scala/Java side. Then we map each Row in</span> |
| <span class="c1"># the DataFrame back to a MatrixEntry on this side.</span> |
| <span class="n">entries_df</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"getMatrixEntries"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span><span class="p">)</span> |
| <span class="n">entries</span> <span class="o">=</span> <span class="n">entries_df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">MatrixEntry</span><span class="p">(</span><span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">row</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">row</span><span class="p">[</span><span class="mi">2</span><span class="p">]))</span> |
| <span class="k">return</span> <span class="n">entries</span> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.numRows"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.numRows">[docs]</a> <span class="k">def</span> <span class="nf">numRows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(1, 0, 2),</span> |
| <span class="sd"> ... MatrixEntry(2, 1, 3.7)])</span> |
| |
| <span class="sd"> >>> mat = CoordinateMatrix(entries)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 3</span> |
| |
| <span class="sd"> >>> mat = CoordinateMatrix(entries, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numRows"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.numCols"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.numCols">[docs]</a> <span class="k">def</span> <span class="nf">numCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of cols.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(1, 0, 2),</span> |
| <span class="sd"> ... MatrixEntry(2, 1, 3.7)])</span> |
| |
| <span class="sd"> >>> mat = CoordinateMatrix(entries)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> >>> mat = CoordinateMatrix(entries, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 6</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numCols"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.transpose"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.transpose">[docs]</a> <span class="k">def</span> <span class="nf">transpose</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Transpose this CoordinateMatrix.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(1, 0, 2),</span> |
| <span class="sd"> ... MatrixEntry(2, 1, 3.7)])</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(entries)</span> |
| <span class="sd"> >>> mat_transposed = mat.transpose()</span> |
| |
| <span class="sd"> >>> print(mat_transposed.numRows())</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> >>> print(mat_transposed.numCols())</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="n">java_transposed_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"transpose"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">java_transposed_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.toRowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.toRowMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toRowMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a RowMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(6, 4, 2.1)])</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(entries).toRowMatrix()</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 7 effective rows, due to</span> |
| <span class="sd"> >>> # the highest row index being 6, but the ensuing RowMatrix</span> |
| <span class="sd"> >>> # will only have 2 rows since there are only entries on 2</span> |
| <span class="sd"> >>> # unique rows.</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 5 columns, due to the</span> |
| <span class="sd"> >>> # highest column index being 4, and the ensuing RowMatrix</span> |
| <span class="sd"> >>> # will have 5 columns as well.</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 5</span> |
| <span class="sd"> """</span> |
| <span class="n">java_row_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toRowMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">RowMatrix</span><span class="p">(</span><span class="n">java_row_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.toIndexedRowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.toIndexedRowMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toIndexedRowMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to an IndexedRowMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(6, 4, 2.1)])</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(entries).toIndexedRowMatrix()</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 7 effective rows, due to</span> |
| <span class="sd"> >>> # the highest row index being 6, and the ensuing</span> |
| <span class="sd"> >>> # IndexedRowMatrix will have 7 rows as well.</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 5 columns, due to the</span> |
| <span class="sd"> >>> # highest column index being 4, and the ensuing</span> |
| <span class="sd"> >>> # IndexedRowMatrix will have 5 columns as well.</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 5</span> |
| <span class="sd"> """</span> |
| <span class="n">java_indexed_row_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toIndexedRowMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">java_indexed_row_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="CoordinateMatrix.toBlockMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.CoordinateMatrix.html#pyspark.mllib.linalg.distributed.CoordinateMatrix.toBlockMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toBlockMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">rowsPerBlock</span><span class="o">=</span><span class="mi">1024</span><span class="p">,</span> <span class="n">colsPerBlock</span><span class="o">=</span><span class="mi">1024</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a BlockMatrix.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rowsPerBlock : int, optional</span> |
| <span class="sd"> Number of rows that make up each block.</span> |
| <span class="sd"> The blocks forming the final rows are not</span> |
| <span class="sd"> required to have the given number of rows.</span> |
| <span class="sd"> colsPerBlock : int, optional</span> |
| <span class="sd"> Number of columns that make up each block.</span> |
| <span class="sd"> The blocks forming the final columns are not</span> |
| <span class="sd"> required to have the given number of columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> entries = sc.parallelize([MatrixEntry(0, 0, 1.2),</span> |
| <span class="sd"> ... MatrixEntry(6, 4, 2.1)])</span> |
| <span class="sd"> >>> mat = CoordinateMatrix(entries).toBlockMatrix()</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 7 effective rows, due to</span> |
| <span class="sd"> >>> # the highest row index being 6, and the ensuing</span> |
| <span class="sd"> >>> # BlockMatrix will have 7 rows as well.</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| |
| <span class="sd"> >>> # This CoordinateMatrix will have 5 columns, due to the</span> |
| <span class="sd"> >>> # highest column index being 4, and the ensuing</span> |
| <span class="sd"> >>> # BlockMatrix will have 5 columns as well.</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 5</span> |
| <span class="sd"> """</span> |
| <span class="n">java_block_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toBlockMatrix"</span><span class="p">,</span> |
| <span class="n">rowsPerBlock</span><span class="p">,</span> |
| <span class="n">colsPerBlock</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_block_matrix</span><span class="p">,</span> <span class="n">rowsPerBlock</span><span class="p">,</span> <span class="n">colsPerBlock</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_convert_to_matrix_block_tuple</span><span class="p">(</span><span class="n">block</span><span class="p">):</span> |
| <span class="k">if</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">block</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">block</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> |
| <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">block</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">block</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="o">==</span> <span class="mi">2</span> |
| <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">block</span><span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="n">Matrix</span><span class="p">)):</span> |
| <span class="n">blockRowIndex</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">block</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">])</span> |
| <span class="n">blockColIndex</span> <span class="o">=</span> <span class="nb">int</span><span class="p">(</span><span class="n">block</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">])</span> |
| <span class="n">subMatrix</span> <span class="o">=</span> <span class="n">block</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> |
| <span class="k">return</span> <span class="p">((</span><span class="n">blockRowIndex</span><span class="p">,</span> <span class="n">blockColIndex</span><span class="p">),</span> <span class="n">subMatrix</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Cannot convert type </span><span class="si">%s</span><span class="s2"> into a sub-matrix block tuple"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">block</span><span class="p">))</span> |
| |
| |
| <div class="viewcode-block" id="BlockMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix">[docs]</a><span class="k">class</span> <span class="nc">BlockMatrix</span><span class="p">(</span><span class="n">DistributedMatrix</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Represents a distributed matrix in blocks of local matrices.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> blocks : :py:class:`pyspark.RDD`</span> |
| <span class="sd"> An RDD of sub-matrix blocks</span> |
| <span class="sd"> ((blockRowIndex, blockColIndex), sub-matrix) that</span> |
| <span class="sd"> form this distributed matrix. If multiple blocks</span> |
| <span class="sd"> with the same index exist, the results for</span> |
| <span class="sd"> operations like add and multiply will be</span> |
| <span class="sd"> unpredictable.</span> |
| <span class="sd"> rowsPerBlock : int</span> |
| <span class="sd"> Number of rows that make up each block.</span> |
| <span class="sd"> The blocks forming the final rows are not</span> |
| <span class="sd"> required to have the given number of rows.</span> |
| <span class="sd"> colsPerBlock : int</span> |
| <span class="sd"> Number of columns that make up each block.</span> |
| <span class="sd"> The blocks forming the final columns are not</span> |
| <span class="sd"> required to have the given number of columns.</span> |
| <span class="sd"> numRows : int, optional</span> |
| <span class="sd"> Number of rows of this matrix. If the supplied</span> |
| <span class="sd"> value is less than or equal to zero, the number</span> |
| <span class="sd"> of rows will be calculated when `numRows` is</span> |
| <span class="sd"> invoked.</span> |
| <span class="sd"> numCols : int, optional</span> |
| <span class="sd"> Number of columns of this matrix. If the supplied</span> |
| <span class="sd"> value is less than or equal to zero, the number</span> |
| <span class="sd"> of columns will be calculated when `numCols` is</span> |
| <span class="sd"> invoked.</span> |
| <span class="sd"> """</span> |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">blocks</span><span class="p">,</span> <span class="n">rowsPerBlock</span><span class="p">,</span> <span class="n">colsPerBlock</span><span class="p">,</span> <span class="n">numRows</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">numCols</span><span class="o">=</span><span class="mi">0</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Note: This docstring is not shown publicly.</span> |
| |
| <span class="sd"> Create a wrapper over a Java BlockMatrix.</span> |
| |
| <span class="sd"> Publicly, we require that `blocks` be an RDD. However, for</span> |
| <span class="sd"> internal usage, `blocks` can also be a Java BlockMatrix</span> |
| <span class="sd"> object, in which case we can wrap it directly. This</span> |
| <span class="sd"> assists in clean matrix conversions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| |
| <span class="sd"> >>> mat_diff = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> (mat_diff._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> >>> mat_same = BlockMatrix(mat._java_matrix_wrapper._java_model, 3, 2)</span> |
| <span class="sd"> >>> (mat_same._java_matrix_wrapper._java_model ==</span> |
| <span class="sd"> ... mat._java_matrix_wrapper._java_model)</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">blocks</span> <span class="o">=</span> <span class="n">blocks</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">_convert_to_matrix_block_tuple</span><span class="p">)</span> |
| <span class="c1"># We use DataFrames for serialization of sub-matrix blocks</span> |
| <span class="c1"># from Python, so first convert the RDD to a DataFrame on</span> |
| <span class="c1"># this side. This will convert each sub-matrix block</span> |
| <span class="c1"># tuple to a Row containing the 'blockRowIndex',</span> |
| <span class="c1"># 'blockColIndex', and 'subMatrix' values, which can</span> |
| <span class="c1"># each be easily serialized. We will convert back to</span> |
| <span class="c1"># ((blockRowIndex, blockColIndex), sub-matrix) tuples on</span> |
| <span class="c1"># the Scala side.</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"createBlockMatrix"</span><span class="p">,</span> <span class="n">blocks</span><span class="o">.</span><span class="n">toDF</span><span class="p">(),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">rowsPerBlock</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">colsPerBlock</span><span class="p">),</span> |
| <span class="nb">int</span><span class="p">(</span><span class="n">numRows</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numCols</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">blocks</span><span class="p">,</span> <span class="n">JavaObject</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">blocks</span><span class="o">.</span><span class="n">getClass</span><span class="p">()</span><span class="o">.</span><span class="n">getSimpleName</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"BlockMatrix"</span><span class="p">):</span> |
| <span class="n">java_matrix</span> <span class="o">=</span> <span class="n">blocks</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"blocks should be an RDD of sub-matrix blocks as "</span> |
| <span class="s2">"((int, int), matrix) tuples, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">blocks</span><span class="p">))</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span> <span class="o">=</span> <span class="n">JavaModelWrapper</span><span class="p">(</span><span class="n">java_matrix</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">blocks</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> The RDD of sub-matrix blocks</span> |
| <span class="sd"> ((blockRowIndex, blockColIndex), sub-matrix) that form this</span> |
| <span class="sd"> distributed matrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> mat = BlockMatrix(</span> |
| <span class="sd"> ... sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))]), 3, 2)</span> |
| <span class="sd"> >>> blocks = mat.blocks</span> |
| <span class="sd"> >>> blocks.first()</span> |
| <span class="sd"> ((0, 0), DenseMatrix(3, 2, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], 0))</span> |
| |
| <span class="sd"> """</span> |
| <span class="c1"># We use DataFrames for serialization of sub-matrix blocks</span> |
| <span class="c1"># from Java, so we first convert the RDD of blocks to a</span> |
| <span class="c1"># DataFrame on the Scala/Java side. Then we map each Row in</span> |
| <span class="c1"># the DataFrame back to a sub-matrix block on this side.</span> |
| <span class="n">blocks_df</span> <span class="o">=</span> <span class="n">callMLlibFunc</span><span class="p">(</span><span class="s2">"getMatrixBlocks"</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span><span class="p">)</span> |
| <span class="n">blocks</span> <span class="o">=</span> <span class="n">blocks_df</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="p">((</span><span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="n">row</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">1</span><span class="p">]),</span> <span class="n">row</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span> |
| <span class="k">return</span> <span class="n">blocks</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">rowsPerBlock</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Number of rows that make up each block.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> mat.rowsPerBlock</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"rowsPerBlock"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">colsPerBlock</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Number of columns that make up each block.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> mat.colsPerBlock</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"colsPerBlock"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">numRowBlocks</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Number of rows of blocks in the BlockMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> mat.numRowBlocks</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numRowBlocks"</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">numColBlocks</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Number of columns of blocks in the BlockMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> mat.numColBlocks</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numColBlocks"</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="BlockMatrix.numRows"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.numRows">[docs]</a> <span class="k">def</span> <span class="nf">numRows</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 6</span> |
| |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numRows"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.numCols"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.numCols">[docs]</a> <span class="k">def</span> <span class="nf">numCols</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Get or compute the number of columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2, 7, 6)</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 6</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"numCols"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.cache"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.cache">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.0.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">cache</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Caches the underlying RDD.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"cache"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.persist"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.persist">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.0.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">persist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">storageLevel</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Persists the underlying RDD with the specified storage level.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">storageLevel</span><span class="p">,</span> <span class="n">StorageLevel</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"`storageLevel` should be a StorageLevel, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">storageLevel</span><span class="p">))</span> |
| <span class="n">javaStorageLevel</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_getJavaStorageLevel</span><span class="p">(</span><span class="n">storageLevel</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"persist"</span><span class="p">,</span> <span class="n">javaStorageLevel</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.validate"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.validate">[docs]</a> <span class="nd">@since</span><span class="p">(</span><span class="s1">'2.0.0'</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">validate</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Validates the block matrix info against the matrix data (`blocks`)</span> |
| <span class="sd"> and throws an exception if any error is found.</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"validate"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.add"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.add">[docs]</a> <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Adds two block matrices together. The matrices must have the</span> |
| <span class="sd"> same size and matching `rowsPerBlock` and `colsPerBlock` values.</span> |
| <span class="sd"> If one of the sub-matrix blocks that are being added is a</span> |
| <span class="sd"> SparseMatrix, the resulting sub-matrix block will also be a</span> |
| <span class="sd"> SparseMatrix, even if it is being added to a DenseMatrix. If</span> |
| <span class="sd"> two dense sub-matrix blocks are added, the output block will</span> |
| <span class="sd"> also be a DenseMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])</span> |
| <span class="sd"> >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])</span> |
| <span class="sd"> >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])</span> |
| <span class="sd"> >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])</span> |
| <span class="sd"> >>> blocks2 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])</span> |
| <span class="sd"> >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])</span> |
| <span class="sd"> >>> mat1 = BlockMatrix(blocks1, 3, 2)</span> |
| <span class="sd"> >>> mat2 = BlockMatrix(blocks2, 3, 2)</span> |
| <span class="sd"> >>> mat3 = BlockMatrix(blocks3, 3, 2)</span> |
| |
| <span class="sd"> >>> mat1.add(mat2).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(6, 2, [2.0, 4.0, 6.0, 14.0, 16.0, 18.0, 8.0, 10.0, 12.0, 20.0, 22.0, 24.0], 0)</span> |
| |
| <span class="sd"> >>> mat1.add(mat3).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(6, 2, [8.0, 2.0, 3.0, 14.0, 16.0, 18.0, 4.0, 16.0, 18.0, 20.0, 22.0, 24.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">BlockMatrix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Other should be a BlockMatrix, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">))</span> |
| |
| <span class="n">other_java_block_matrix</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span> |
| <span class="n">java_block_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"add"</span><span class="p">,</span> <span class="n">other_java_block_matrix</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_block_matrix</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">rowsPerBlock</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">colsPerBlock</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.subtract"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.subtract">[docs]</a> <span class="k">def</span> <span class="nf">subtract</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Subtracts the given block matrix `other` from this block matrix:</span> |
| <span class="sd"> `this - other`. The matrices must have the same size and</span> |
| <span class="sd"> matching `rowsPerBlock` and `colsPerBlock` values. If one of</span> |
| <span class="sd"> the sub-matrix blocks that are being subtracted is a</span> |
| <span class="sd"> SparseMatrix, the resulting sub-matrix block will also be a</span> |
| <span class="sd"> SparseMatrix, even if it is being subtracted from a DenseMatrix.</span> |
| <span class="sd"> If two dense sub-matrix blocks are subtracted, the output block</span> |
| <span class="sd"> will also be a DenseMatrix.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> dm1 = Matrices.dense(3, 2, [3, 1, 5, 4, 6, 2])</span> |
| <span class="sd"> >>> dm2 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])</span> |
| <span class="sd"> >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [1, 2, 3])</span> |
| <span class="sd"> >>> blocks1 = sc.parallelize([((0, 0), dm1), ((1, 0), dm2)])</span> |
| <span class="sd"> >>> blocks2 = sc.parallelize([((0, 0), dm2), ((1, 0), dm1)])</span> |
| <span class="sd"> >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm2)])</span> |
| <span class="sd"> >>> mat1 = BlockMatrix(blocks1, 3, 2)</span> |
| <span class="sd"> >>> mat2 = BlockMatrix(blocks2, 3, 2)</span> |
| <span class="sd"> >>> mat3 = BlockMatrix(blocks3, 3, 2)</span> |
| |
| <span class="sd"> >>> mat1.subtract(mat2).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(6, 2, [-4.0, -7.0, -4.0, 4.0, 7.0, 4.0, -6.0, -5.0, -10.0, 6.0, 5.0, 10.0], 0)</span> |
| |
| <span class="sd"> >>> mat2.subtract(mat3).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(6, 2, [6.0, 8.0, 9.0, -4.0, -7.0, -4.0, 10.0, 9.0, 9.0, -6.0, -5.0, -10.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">BlockMatrix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Other should be a BlockMatrix, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">))</span> |
| |
| <span class="n">other_java_block_matrix</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span> |
| <span class="n">java_block_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"subtract"</span><span class="p">,</span> <span class="n">other_java_block_matrix</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_block_matrix</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">rowsPerBlock</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">colsPerBlock</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.multiply"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.multiply">[docs]</a> <span class="k">def</span> <span class="nf">multiply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Multiplies this BlockMatrix by `other`, another BlockMatrix,</span> |
| <span class="sd"> computing `this * other`. The `colsPerBlock` of this matrix must equal the</span> |
| <span class="sd"> `rowsPerBlock` of `other`. If `other` contains any SparseMatrix</span> |
| <span class="sd"> blocks, they will have to be converted to DenseMatrix blocks.</span> |
| <span class="sd"> The output BlockMatrix will only consist of DenseMatrix blocks.</span> |
| <span class="sd"> This may cause some performance issues until support for</span> |
| <span class="sd"> multiplying two sparse matrices is added.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> dm1 = Matrices.dense(2, 3, [1, 2, 3, 4, 5, 6])</span> |
| <span class="sd"> >>> dm2 = Matrices.dense(2, 3, [7, 8, 9, 10, 11, 12])</span> |
| <span class="sd"> >>> dm3 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])</span> |
| <span class="sd"> >>> dm4 = Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12])</span> |
| <span class="sd"> >>> sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])</span> |
| <span class="sd"> >>> blocks1 = sc.parallelize([((0, 0), dm1), ((0, 1), dm2)])</span> |
| <span class="sd"> >>> blocks2 = sc.parallelize([((0, 0), dm3), ((1, 0), dm4)])</span> |
| <span class="sd"> >>> blocks3 = sc.parallelize([((0, 0), sm), ((1, 0), dm4)])</span> |
| <span class="sd"> >>> mat1 = BlockMatrix(blocks1, 2, 3)</span> |
| <span class="sd"> >>> mat2 = BlockMatrix(blocks2, 3, 2)</span> |
| <span class="sd"> >>> mat3 = BlockMatrix(blocks3, 3, 2)</span> |
| |
| <span class="sd"> >>> mat1.multiply(mat2).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(2, 2, [242.0, 272.0, 350.0, 398.0], 0)</span> |
| |
| <span class="sd"> >>> mat1.multiply(mat3).toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(2, 2, [227.0, 258.0, 394.0, 450.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">BlockMatrix</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Other should be a BlockMatrix, got </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">other</span><span class="p">))</span> |
| |
| <span class="n">other_java_block_matrix</span> <span class="o">=</span> <span class="n">other</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">_java_model</span> |
| <span class="n">java_block_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"multiply"</span><span class="p">,</span> <span class="n">other_java_block_matrix</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_block_matrix</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">rowsPerBlock</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">colsPerBlock</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.transpose"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.transpose">[docs]</a> <span class="k">def</span> <span class="nf">transpose</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Transpose this BlockMatrix. Returns a new BlockMatrix</span> |
| <span class="sd"> instance sharing the same underlying data. Is a lazy operation.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2)</span> |
| |
| <span class="sd"> >>> mat_transposed = mat.transpose()</span> |
| <span class="sd"> >>> mat_transposed.toLocalMatrix()</span> |
| <span class="sd"> DenseMatrix(2, 6, [1.0, 4.0, 2.0, 5.0, 3.0, 6.0, 7.0, 10.0, 8.0, 11.0, 9.0, 12.0], 0)</span> |
| <span class="sd"> """</span> |
| <span class="n">java_transposed_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"transpose"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">BlockMatrix</span><span class="p">(</span><span class="n">java_transposed_matrix</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">colsPerBlock</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">rowsPerBlock</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.toLocalMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.toLocalMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toLocalMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Collect the distributed matrix on the driver as a DenseMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2).toLocalMatrix()</span> |
| |
| <span class="sd"> >>> # This BlockMatrix will have 6 effective rows, due to</span> |
| <span class="sd"> >>> # having two sub-matrix blocks stacked, each with 3 rows.</span> |
| <span class="sd"> >>> # The ensuing DenseMatrix will also have 6 rows.</span> |
| <span class="sd"> >>> print(mat.numRows)</span> |
| <span class="sd"> 6</span> |
| |
| <span class="sd"> >>> # This BlockMatrix will have 2 effective columns, due to</span> |
| <span class="sd"> >>> # having two sub-matrix blocks stacked, each with 2</span> |
| <span class="sd"> >>> # columns. The ensuing DenseMatrix will also have 2 columns.</span> |
| <span class="sd"> >>> print(mat.numCols)</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toLocalMatrix"</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.toIndexedRowMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.toIndexedRowMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toIndexedRowMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to an IndexedRowMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 3, 2).toIndexedRowMatrix()</span> |
| |
| <span class="sd"> >>> # This BlockMatrix will have 6 effective rows, due to</span> |
| <span class="sd"> >>> # having two sub-matrix blocks stacked, each with 3 rows.</span> |
| <span class="sd"> >>> # The ensuing IndexedRowMatrix will also have 6 rows.</span> |
| <span class="sd"> >>> print(mat.numRows())</span> |
| <span class="sd"> 6</span> |
| |
| <span class="sd"> >>> # This BlockMatrix will have 2 effective columns, due to</span> |
| <span class="sd"> >>> # having two sub-matrix blocks stacked, each with 2 columns.</span> |
| <span class="sd"> >>> # The ensuing IndexedRowMatrix will also have 2 columns.</span> |
| <span class="sd"> >>> print(mat.numCols())</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="n">java_indexed_row_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toIndexedRowMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">IndexedRowMatrix</span><span class="p">(</span><span class="n">java_indexed_row_matrix</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="BlockMatrix.toCoordinateMatrix"><a class="viewcode-back" href="../../../../reference/api/pyspark.mllib.linalg.distributed.BlockMatrix.html#pyspark.mllib.linalg.distributed.BlockMatrix.toCoordinateMatrix">[docs]</a> <span class="k">def</span> <span class="nf">toCoordinateMatrix</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Convert this matrix to a CoordinateMatrix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> blocks = sc.parallelize([((0, 0), Matrices.dense(1, 2, [1, 2])),</span> |
| <span class="sd"> ... ((1, 0), Matrices.dense(1, 2, [7, 8]))])</span> |
| <span class="sd"> >>> mat = BlockMatrix(blocks, 1, 2).toCoordinateMatrix()</span> |
| <span class="sd"> >>> mat.entries.take(3)</span> |
| <span class="sd"> [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 0, 7.0)]</span> |
| <span class="sd"> """</span> |
| <span class="n">java_coordinate_matrix</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_java_matrix_wrapper</span><span class="o">.</span><span class="n">call</span><span class="p">(</span><span class="s2">"toCoordinateMatrix"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">CoordinateMatrix</span><span class="p">(</span><span class="n">java_coordinate_matrix</span><span class="p">)</span></div></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">():</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Matrices</span> |
| <span class="kn">import</span> <span class="nn">pyspark.mllib.linalg.distributed</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="c1"># Numpy 1.14+ changed its string format.</span> |
| <span class="n">numpy</span><span class="o">.</span><span class="n">set_printoptions</span><span class="p">(</span><span class="n">legacy</span><span class="o">=</span><span class="s1">'1.13'</span><span class="p">)</span> |
| <span class="k">except</span> <span class="ne">TypeError</span><span class="p">:</span> |
| <span class="k">pass</span> |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">mllib</span><span class="o">.</span><span class="n">linalg</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span>\ |
| <span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[2]"</span><span class="p">)</span>\ |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"mllib.linalg.distributed tests"</span><span class="p">)</span>\ |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'sc'</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s1">'Matrices'</span><span class="p">]</span> <span class="o">=</span> <span class="n">Matrices</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span><span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span><span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |