/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Download data from: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip
 * unzip it, update the /path/to/lastfm paths below, then run this in the Mahout Spark shell.
 */
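/*
 * One way to launch the shell (a sketch; exact paths depend on your Mahout and Spark install):
 *   $MAHOUT_HOME/bin/mahout spark-shell
 * then paste this script in, or load it with the REPL's :load command.
 */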
import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
// First turn the raw, tab-separated files into RDD[(String, String)] pairs of IDs, dropping each file's header row.
val userTagsRDD = sc.textFile("/path/to/lastfm/user_taggedartists.dat").map(line => line.split("\t")).map(a => (a(0), a(2))).filter(_._1 != "userID")
val userTagsIDS = IndexedDatasetSpark.apply(userTagsRDD)(sc)
val userArtistsRDD = sc.textFile("/path/to/lastfm/user_artists.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc)
val userFriendsRDD = sc.textFile("/path/to/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc)
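// Optional sanity check: each IndexedDataset wraps a DRM plus row/column dictionaries, so the
// dimensions should line up with the raw data.
println(s"user-artist matrix: ${userArtistsIDS.matrix.nrow} x ${userArtistsIDS.matrix.ncol}")
println(s"user-tag matrix:    ${userTagsIDS.matrix.nrow} x ${userTagsIDS.matrix.ncol}")
println(s"user-friend matrix: ${userFriendsIDS.matrix.nrow} x ${userFriendsIDS.matrix.ncol}")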
// Friendship is the behavior we want to predict (the "primary" action); artist plays and tags
// serve as secondary, cross-occurrence actions.
val primaryIDS = userFriendsIDS
val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD)
import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary}
// Pad a dataset's row cardinality up to the primary dataset's so that all matrices share a row space.
def adjustRowCardinality(rowCardinality: Int, datasetA: IndexedDataset): IndexedDataset = {
  if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality)
  else datasetA // this guarantees matching cardinality
}
val rowCardinality = primaryIDS.rowIDs.size
val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length)
// Broadcast the primary row (user) dictionary once and use it to filter each secondary action
// down to users that appear in the friend data, indexing the rest against that same dictionary.
val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs)
for (i <- secondaryActionRDDs.indices) {
  val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1))
  val tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc)
  secondaryActionIDS(i) = adjustRowCardinality(rowCardinality, tempIDS)
}
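// All three IndexedDatasets now share the same row (user) space, which is what the cooccurrence
// analysis below requires.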
import org.apache.mahout.math.cf.SimilarityAnalysis
val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(
  Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)),
  maxInterestingItemsPerThing = 20,
  maxNumInteractions = 500,
  randomSeed = 1234)
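// cooccurrencesIDSs returns one IndexedDataset per input: element 0 is the LLR-filtered
// self-cooccurrence of the primary action (friend x friend), and elements 1 and 2 are the
// cross-occurrences of friendship with artist plays (friend x artist) and with tags (friend x tag).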
// Anonymous user: build a hand-rolled history of artists and tags for "kilroy".
val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap
val tagsMap = sc.textFile("/path/to/lastfm/tags.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "tagValue").collect.toMap
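// artistMap/tagsMap go from a human-readable name to its string ID in the .dat file, which then
// keys into the IndexedDataset's columnIDs dictionary to get the Mahout column index.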
// Watch your skin, you're not wearing armour: this will fail on misspelled artist or tag names.
// The maps are necessary because the IDs are integer strings already, and for this demo I didn't
// want to change them to Integer types (more often you'll be working with plain strings).
val kilroyUserArtists = svec( (userArtistsIDS.columnIDs.get(artistMap("Beck")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("David Bowie")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Gary Numan")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Less Than Jake")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Lou Reed")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Parliament")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Radiohead")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Seu Jorge")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("The Skatalites")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Reverend Horton Heat")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Talking Heads")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Tom Waits")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Waylon Jennings")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Wu-Tang Clan")).get, 1) :: Nil, cardinality = userArtistsIDS.columnIDs.size
)
val kilroyUserTags = svec(
(userTagsIDS.columnIDs.get(tagsMap("classical")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("skacore")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("why on earth is this just a bonus track")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("punk rock")).get, 1) :: Nil, cardinality = userTagsIDS.columnIDs.size)
// Score every potential friend: (friend x artist) cross-occurrence times the artist history plus
// (friend x tag) cross-occurrence times the tag history.
val kilroysRecs = (artistReccosLlrDrmListByArtist(1).matrix %*% kilroyUserArtists +
  artistReccosLlrDrmListByArtist(2).matrix %*% kilroyUserTags).collect
import org.apache.mahout.math.scalabindings.MahoutCollections._
import collection._
import JavaConversions._
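// MahoutCollections (plus the Java conversions) adds Scala-collection-style helpers such as
// toMap on a Mahout Vector, used just below.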
// Which users I should be friends with:
println(kilroysRecs(::, 0).toMap.toList.sortWith(_._2 > _._2).take(5))
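// The keys printed here are Mahout's internal column indices into the primary (friend) dictionary;
// mapping them back through the reverse of primaryIDS.columnIDs recovers the original Last.fm user IDs.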
/**
 * So there you have it: the basis for a new dating/friend-finding app based on musical preferences,
 * which is actually a pretty dope idea.
 *
 * Solving for which bands a user might like is left as an exercise for the reader.
 */