/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Download data from: http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip
 * unzip it, update the /path/to/lastfm paths below, then run this in the Mahout Spark shell.
 */
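/*
 * One way to launch the shell (a sketch; exact paths depend on your Mahout and Spark install):
 *   $MAHOUT_HOME/bin/mahout spark-shell
 * then paste this script in, or load it with the REPL's :load command.
 */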
import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
// First turn the raw, tab-separated files into RDD[(String, String)] pairs of IDs, dropping each file's header row.
val userTagsRDD = sc.textFile("/path/to/lastfm/user_taggedartists.dat").map(line => line.split("\t")).map(a => (a(0), a(2))).filter(_._1 != "userID")
val userTagsIDS = IndexedDatasetSpark.apply(userTagsRDD)(sc)
val userArtistsRDD = sc.textFile("/path/to/lastfm/user_artists.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
val userArtistsIDS = IndexedDatasetSpark.apply(userArtistsRDD)(sc)
val userFriendsRDD = sc.textFile("/path/to/lastfm/user_friends.dat").map(line => line.split("\t")).map(a => (a(0), a(1))).filter(_._1 != "userID")
val userFriendsIDS = IndexedDatasetSpark.apply(userFriendsRDD)(sc)
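// Optional sanity check: each IndexedDataset wraps a DRM plus row/column dictionaries, so the
// dimensions should line up with the raw data.
println(s"user-artist matrix: ${userArtistsIDS.matrix.nrow} x ${userArtistsIDS.matrix.ncol}")
println(s"user-tag matrix:    ${userTagsIDS.matrix.nrow} x ${userTagsIDS.matrix.ncol}")
println(s"user-friend matrix: ${userFriendsIDS.matrix.nrow} x ${userFriendsIDS.matrix.ncol}")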
// Friendship is the behavior we want to predict (the "primary" action); artist plays and tags
// serve as secondary, cross-occurrence actions.
val primaryIDS = userFriendsIDS
val secondaryActionRDDs = List(userArtistsRDD, userTagsRDD)
import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary}
// Pad a dataset's row cardinality up to the primary dataset's so that all matrices share a row space.
def adjustRowCardinality(rowCardinality: Int, datasetA: IndexedDataset): IndexedDataset = {
  if (rowCardinality != datasetA.matrix.nrow) datasetA.newRowCardinality(rowCardinality)
  else datasetA // this guarantees matching cardinality
}
val rowCardinality = primaryIDS.rowIDs.size
val secondaryActionIDS: Array[IndexedDataset] = new Array[IndexedDataset](secondaryActionRDDs.length)
// Broadcast the primary row (user) dictionary once and use it to filter each secondary action
// down to users that appear in the friend data, indexing the rest against that same dictionary.
val bcPrimaryRowIDs = sc.broadcast(primaryIDS.rowIDs)
for (i <- secondaryActionRDDs.indices) {
  val tempRDD = secondaryActionRDDs(i).filter(a => bcPrimaryRowIDs.value.contains(a._1))
  val tempIDS = IndexedDatasetSpark.apply(tempRDD, existingRowIDs = Some(primaryIDS.rowIDs))(sc)
  secondaryActionIDS(i) = adjustRowCardinality(rowCardinality, tempIDS)
}
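// All three IndexedDatasets now share the same row (user) space, which is what the cooccurrence
// analysis below requires.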
import org.apache.mahout.math.cf.SimilarityAnalysis
val artistReccosLlrDrmListByArtist = SimilarityAnalysis.cooccurrencesIDSs(
  Array(primaryIDS, secondaryActionIDS(0), secondaryActionIDS(1)),
  maxInterestingItemsPerThing = 20,
  maxNumInteractions = 500,
  randomSeed = 1234)
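// cooccurrencesIDSs returns one IndexedDataset per input: element 0 is the LLR-filtered
// self-cooccurrence of the primary action (friend x friend), and elements 1 and 2 are the
// cross-occurrences of friendship with artist plays (friend x artist) and with tags (friend x tag).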
// Anonymous user: build a hand-rolled history of artists and tags for "kilroy".
val artistMap = sc.textFile("/path/to/lastfm/artists.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "name").collect.toMap
val tagsMap = sc.textFile("/path/to/lastfm/tags.dat").map(line => line.split("\t")).map(a => (a(1), a(0))).filter(_._1 != "tagValue").collect.toMap
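// artistMap/tagsMap go from a human-readable name to its string ID in the .dat file, which then
// keys into the IndexedDataset's columnIDs dictionary to get the Mahout column index.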
// Watch your skin, you're not wearing armour: this will fail on misspelled artist or tag names.
// The maps are necessary because the IDs are integer strings already, and for this demo I didn't
// want to change them to Integer types (more often you'll be working with plain strings).
val kilroyUserArtists = svec( (userArtistsIDS.columnIDs.get(artistMap("Beck")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("David Bowie")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Gary Numan")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Less Than Jake")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Lou Reed")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Parliament")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Radiohead")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Seu Jorge")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("The Skatalites")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Reverend Horton Heat")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Talking Heads")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Tom Waits")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Waylon Jennings")).get, 1) ::
(userArtistsIDS.columnIDs.get(artistMap("Wu-Tang Clan")).get, 1) :: Nil, cardinality = userArtistsIDS.columnIDs.size
)
val kilroyUserTags = svec(
(userTagsIDS.columnIDs.get(tagsMap("classical")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("skacore")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("why on earth is this just a bonus track")).get, 1) ::
(userTagsIDS.columnIDs.get(tagsMap("punk rock")).get, 1) :: Nil, cardinality = userTagsIDS.columnIDs.size)
// Score every potential friend: (friend x artist) cross-occurrence times the artist history plus
// (friend x tag) cross-occurrence times the tag history.
val kilroysRecs = (artistReccosLlrDrmListByArtist(1).matrix %*% kilroyUserArtists +
  artistReccosLlrDrmListByArtist(2).matrix %*% kilroyUserTags).collect
import org.apache.mahout.math.scalabindings.MahoutCollections._
import collection._
import JavaConversions._
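// MahoutCollections (plus the Java conversions) adds Scala-collection-style helpers such as
// toMap on a Mahout Vector, used just below.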
// Which users I should be friends with:
println(kilroysRecs(::, 0).toMap.toList.sortWith(_._2 > _._2).take(5))
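// The keys printed here are Mahout's internal column indices into the primary (friend) dictionary;
// mapping them back through the reverse of primaryIDS.columnIDs recovers the original Last.fm user IDs.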
/**
 * So there you have it: the basis for a new dating/friend-finding app based on musical preferences,
 * which is actually a pretty dope idea.
 *
 * Solving for which bands a user might like is left as an exercise for the reader.
 */