Commit 0f561ff

Merge pull request #13 from rajdeepd/master
changes in chapter 4 scala code - added package
2 parents: 7bf10ac + b08a9d9

File tree

6 files changed: +72 additions, -20 deletions

Chapter_04/python/code04.ipynb

Lines changed: 21 additions & 13 deletions
Large diffs are not rendered by default.

Chapter_04/scala/build.sbt

Lines changed: 7 additions & 1 deletion
@@ -1,7 +1,13 @@
-name := "code03"
+name := "chapter04"
 
 version := "1.0"
 
 scalaVersion := "2.10.4"
 
 libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.0"
+libraryDependencies += "org.scalanlp" %% "breeze" % "0.12"
+// native libraries are not included by default. add this if you want them (as of 0.7)
+// native libraries greatly improve performance, but increase jar sizes.
+// It also packages various blas implementations, which have licenses that may or may not
+// be compatible with the Apache License. No GPL code, as best I know.
+libraryDependencies +="org.scalanlp" %% "breeze-natives" % "0.12"
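
For context, the %% operator appends the Scala binary version to the artifact name, so with scalaVersion set to 2.10.4 the new entries resolve to the breeze_2.10 and breeze-natives_2.10 artifacts. A sketch of the equivalent spelling with the binary version written out by hand (versions taken from the diff above; this fragment is not part of the commit):

// Equivalent build.sbt fragment: % with an explicit _2.10 suffix
// pins the Scala binary version instead of letting %% infer it.
libraryDependencies ++= Seq(
  "org.scalanlp" % "breeze_2.10" % "0.12",
  "org.scalanlp" % "breeze-natives_2.10" % "0.12"
)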

Chapter_04/scala/src/main/scala/MovieData.scala renamed to Chapter_04/scala/src/main/scala/org/sparksamples/MovieData.scala

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+package org.sparksamples
+
 /**
  * Created by Rajdeep on 12/22/15.
  */

Chapter_04/scala/src/main/scala/RatingData.scala renamed to Chapter_04/scala/src/main/scala/org/sparksamples/RatingData.scala

Lines changed: 26 additions & 5 deletions
@@ -1,12 +1,19 @@
+package org.sparksamples
+
 /**
  * Created by Rajdeep on 12/22/15.
  */
 import org.apache.spark.SparkContext
+import breeze.linalg.DenseVector
 
 object RatingData {
+  val util = new Util()
+  val sc = new SparkContext("local[2]", "First Spark App")
+  util.sc = sc
+  val user_data = util.getUserData()
 
   def main(args: Array[String]) {
-    val sc = new SparkContext("local[2]", "First Spark App")
+
     val rating_data_raw = sc.textFile("../../data/ml-100k/u.data")
 
     println(rating_data_raw.first())
@@ -28,9 +35,9 @@ object RatingData {
     println("min_rating: " + min_rating)
     println("mean_rating: " + mean_rating)
 
-    var user_data = sc.textFile("../../data/ml-100k/u.user")
-    user_data = user_data.map(l => l.replaceAll("[|]", ","))
-    val user_fields = user_data.map(l => l.split(","))
+    println("user_data.first():" + user_data.first())
+    val user_fields = user_data.map(l => l.split("\\|"))
+    //var num_users = util.getUserFields()
     val num_users = user_fields.map(l => l(0)).count()
     //val median_rating = math.median(ratings.collect()) function not supported - TODO
     val ratings_per_user = num_ratings / num_users
@@ -70,7 +77,21 @@ object RatingData {
 
     println("Encoding of 'doctor : " + all_occupations_dict("doctor"))
     println("Encoding of 'programmer' : " + all_occupations_dict("programmer"))
-
+    //println(all_occupations_dict)
+    /*
+    K = len(all_occupations_dict)
+    binary_x = np.zeros(K)
+    k_programmer = all_occupations_dict['programmer']
+    binary_x[k_programmer] = 1
+    print "Binary feature vector: %s" % binary_x
+    print "Length of binary vector: %d" % K
+    */
+    val k = all_occupations_dict.size
+    val binary_x = DenseVector.zeros[Double](k)
+    val k_programmer = all_occupations_dict("programmer")
+    binary_x(k_programmer) = 1
+    println("Binary feature vector: %s" + binary_x)
+    println("Length of binary vector: " + k)
     sc.stop()
   }
 
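
The commented-out block in the last hunk is the original NumPy version of the 1-of-K occupation encoding; the val lines below it are its Breeze translation. A standalone sketch of the same idea, with a hard-coded occupation map standing in for the dataset-derived all_occupations_dict (illustrative only, not part of the commit):

import breeze.linalg.DenseVector

object OneOfKSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for all_occupations_dict: occupation -> column index.
    val occupationIndex = Map("doctor" -> 0, "programmer" -> 1, "student" -> 2)
    val k = occupationIndex.size
    val binaryX = DenseVector.zeros[Double](k)
    binaryX(occupationIndex("programmer")) = 1.0
    println("Binary feature vector: " + binaryX) // DenseVector(0.0, 1.0, 0.0)
    println("Length of binary vector: " + k)     // 3
  }
}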

Chapter_04/scala/src/main/scala/UserData.scala renamed to Chapter_04/scala/src/main/scala/org/sparksamples/UserData.scala

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+package org.sparksamples
+
 /**
  * Created by Rajdeep on 12/22/15.
  */

Chapter_04/scala/src/main/scala/Util.scala renamed to Chapter_04/scala/src/main/scala/org/sparksamples/Util.scala

Lines changed: 14 additions & 1 deletion
@@ -1,3 +1,5 @@
+package org.sparksamples
+
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
 
@@ -8,7 +10,6 @@ class Util {
   var sc:SparkContext = null
 
   def getMovieDataRDD(): RDD[String] = {
-
     val movie_data = sc.textFile("../../data/ml-100k/u.item")
     return movie_data
   }
@@ -25,5 +26,17 @@ class Util {
     return x.sum/x.length
   }
 
+  def getUserData() : RDD[String] = {
+    //val sc = new SparkContext("local[2]", "First Spark App")
+    var user_data = sc.textFile("../../data/ml-100k/u.user")
+    return user_data
+  }
+
+  def getUserFields() : RDD[Array[String]] = {
+    val user_data = this.getUserData()
+    val user_fields = user_data.map(l => l.split(","))
+    return user_fields
+  }
+
 
 }
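
The new getUserData and getUserFields helpers assume the caller has already assigned a SparkContext to Util's mutable sc field, which is exactly what the RatingData changes above now do. A minimal usage sketch (master, app name and data path are copied from the diff; the object name is illustrative):

package org.sparksamples

import org.apache.spark.SparkContext

object UtilUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[2]", "First Spark App")
    val util = new Util()
    util.sc = sc                    // Util reads files through this context
    val users = util.getUserData()  // RDD[String], one line per u.user record
    println(users.first())
    sc.stop()
  }
}

Note that getUserFields still splits on ",", while u.user is pipe-delimited; the RatingData hunk above sidesteps this by calling getUserData and splitting on "\\|" itself.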
