/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off println
import scala.collection.JavaConverters._

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Computes the PageRank of URLs from an input file. The input file should
 * be in the format:
 * URL neighbor URL
 * URL neighbor URL
 * URL neighbor URL
 * ...
 * where a URL and each of its neighbors are separated by space(s).
 *
 * This variant checkpoints the ranks RDD on every iteration and registers a
 * SparkListener that reports the start and end of each job, including the
 * jobs Spark launches to write checkpoints.
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.graphx.lib.PageRank.
 */
object SparkPageRank_Checkpointing {

  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of PageRank and is given as an example!
        |Please use the PageRank implementation found in org.apache.spark.graphx.lib.PageRank
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: SparkPageRank_Checkpointing <file> <iter>")
      System.exit(1)
    }
    showWarning()

    val sparkConf = new SparkConf().setAppName("PageRank with checkpoint + listener")
    val ctx = new SparkContext(sparkConf)
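
    // Register a listener that prints job start/end events. Each RDD
    // checkpoint runs as its own Spark job; the onJobStart handler below
    // tries to identify those jobs from the job's local properties.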
    ctx.addSparkListener(new SparkListener() {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
        super.onJobStart(jobStart)
        println("ADAPT: inside job start listener")
        // jobStart.properties may be null, so guard before converting it.
        val props = Option(jobStart.properties)
          .map(_.asScala)
          .getOrElse(Map.empty[String, String])
        if (props.contains("spark.rdd.scope")) {
          if (props.contains("name") && props("name") == "checkpoint") {
            println("JobID " + jobStart.jobId)
            println("This is a checkpointing job for RDD - " + props("id"))
            println("StartTime - " + jobStart.time)
          }
        }
      }

      override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
        super.onJobEnd(jobEnd)
        println("ADAPT: inside job end listener")
        println("JobID " + jobEnd.jobId)
        println("EndTime " + jobEnd.time)
      }
    })

    val iters = if (args.length > 1) args(1).toInt else 10
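
    // The checkpoint directory must be reachable by every executor; on a
    // cluster this would typically be a path on a shared filesystem such as
    // HDFS ("/checkpoint-dir" here resolves against the default filesystem).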
    ctx.setCheckpointDir("/checkpoint-dir")
    val lines = ctx.textFile(args(0), 1)
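
    // Build the adjacency lists once and cache them: links is reused by the
    // join in every iteration, so caching avoids re-reading and re-parsing
    // the input file each time.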
    val links = lines.map { s =>
      val parts = s.split("\\s+")
      (parts(0), parts(1))
    }.distinct().groupByKey().cache()
    var ranks = links.mapValues(v => 1.0)
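
    // Standard iterative PageRank: each URL distributes its current rank
    // evenly among its neighbors, and new ranks are computed as
    // 0.15 + 0.85 * (sum of received contributions).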
    for (i <- 1 to iters) {
      val contribs = links.join(ranks).values.flatMap { case (urls, rank) =>
        val size = urls.size
        urls.map(url => (url, rank / size))
      }
      ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
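      // checkpoint() only marks the RDD; the data is actually written when a
      // job first materializes it (the collect() below), at which point the
      // RDD's lineage is truncated to the checkpoint file.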
      ranks.checkpoint()
    }
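
    // collect() is the first action, so it triggers the whole computation,
    // including the checkpoint jobs that the listener above reports.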
    val output = ranks.collect()
    output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + "."))

    ctx.stop()
  }
}
// scalastyle:on println
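
// Example submission (master, jar path, and input file are illustrative):
//   spark-submit --class SparkPageRank_Checkpointing \
//     --master local[4] pagerank-example.jar data/pagerank_data.txt 10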