1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
package org.apache.giraph.graph; |
20 | |
|
21 | |
import java.io.IOException; |
22 | |
import java.lang.management.GarbageCollectorMXBean; |
23 | |
import java.lang.management.ManagementFactory; |
24 | |
import java.util.ArrayList; |
25 | |
import java.util.Collection; |
26 | |
import java.util.Enumeration; |
27 | |
import java.util.List; |
28 | |
import java.util.concurrent.Callable; |
29 | |
import java.util.concurrent.TimeUnit; |
30 | |
|
31 | |
import com.sun.management.GarbageCollectionNotificationInfo; |
32 | |
import com.yammer.metrics.core.Counter; |
33 | |
|
34 | |
import org.apache.commons.lang3.exception.ExceptionUtils; |
35 | |
import org.apache.giraph.bsp.BspService; |
36 | |
import org.apache.giraph.bsp.CentralizedServiceMaster; |
37 | |
import org.apache.giraph.bsp.CentralizedServiceWorker; |
38 | |
import org.apache.giraph.bsp.checkpoints.CheckpointStatus; |
39 | |
import org.apache.giraph.comm.messages.MessageStore; |
40 | |
import org.apache.giraph.conf.ClassConfOption; |
41 | |
import org.apache.giraph.conf.GiraphConstants; |
42 | |
import org.apache.giraph.conf.ImmutableClassesGiraphConfiguration; |
43 | |
import org.apache.giraph.job.JobProgressTracker; |
44 | |
import org.apache.giraph.master.BspServiceMaster; |
45 | |
import org.apache.giraph.master.MasterThread; |
46 | |
import org.apache.giraph.metrics.GiraphMetrics; |
47 | |
import org.apache.giraph.metrics.GiraphMetricsRegistry; |
48 | |
import org.apache.giraph.metrics.GiraphTimer; |
49 | |
import org.apache.giraph.metrics.GiraphTimerContext; |
50 | |
import org.apache.giraph.metrics.ResetSuperstepMetricsObserver; |
51 | |
import org.apache.giraph.metrics.SuperstepMetricsRegistry; |
52 | |
import org.apache.giraph.ooc.OutOfCoreEngine; |
53 | |
import org.apache.giraph.partition.PartitionOwner; |
54 | |
import org.apache.giraph.partition.PartitionStats; |
55 | |
import org.apache.giraph.partition.PartitionStore; |
56 | |
import org.apache.giraph.scripting.ScriptLoader; |
57 | |
import org.apache.giraph.utils.CallableFactory; |
58 | |
import org.apache.giraph.utils.GcObserver; |
59 | |
import org.apache.giraph.utils.MemoryUtils; |
60 | |
import org.apache.giraph.utils.ProgressableUtils; |
61 | |
import org.apache.giraph.worker.BspServiceWorker; |
62 | |
import org.apache.giraph.worker.InputSplitsCallable; |
63 | |
import org.apache.giraph.worker.WorkerContext; |
64 | |
import org.apache.giraph.worker.WorkerObserver; |
65 | |
import org.apache.giraph.worker.WorkerProgress; |
66 | |
import org.apache.giraph.writable.kryo.KryoWritableWrapper; |
67 | |
import org.apache.giraph.zk.ZooKeeperManager; |
68 | |
import org.apache.hadoop.conf.Configuration; |
69 | |
import org.apache.hadoop.fs.Path; |
70 | |
import org.apache.hadoop.io.Writable; |
71 | |
import org.apache.hadoop.io.WritableComparable; |
72 | |
import org.apache.hadoop.mapreduce.Mapper; |
73 | |
import org.apache.log4j.Appender; |
74 | |
import org.apache.log4j.Level; |
75 | |
import org.apache.log4j.LogManager; |
76 | |
import org.apache.log4j.Logger; |
77 | |
import org.apache.log4j.PatternLayout; |
78 | |
|
79 | |
import javax.management.Notification; |
80 | |
import javax.management.NotificationEmitter; |
81 | |
import javax.management.NotificationListener; |
82 | |
import javax.management.openmbean.CompositeData; |
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | |
|
88 | |
|
89 | |
|
90 | |
|
91 | |
|
92 | |
|
93 | |
|
94 | |
|
95 | |
@SuppressWarnings("rawtypes") |
96 | 0 | public class GraphTaskManager<I extends WritableComparable, V extends Writable, |
97 | |
E extends Writable> implements |
98 | |
ResetSuperstepMetricsObserver { |
99 | |
|
100 | |
|
101 | |
|
102 | |
|
103 | |
|
104 | |
|
105 | |
|
106 | |
|
107 | |
|
108 | |
/**
 * Configuration option ("giraph.checkerIfWorkerShouldFailAfterExceptionClass")
 * naming the class that checks whether an exception on some thread should
 * cause the worker to fail. Defaults to FailWithEveryExceptionOccurred.
 */
public static final ClassConfOption<CheckerIfWorkerShouldFailAfterException> |
109 | 0 | CHECKER_IF_WORKER_SHOULD_FAIL_AFTER_EXCEPTION_CLASS = ClassConfOption.create( |
110 | |
"giraph.checkerIfWorkerShouldFailAfterExceptionClass", |
111 | |
FailWithEveryExceptionOccurred.class, |
112 | |
CheckerIfWorkerShouldFailAfterException.class, |
113 | |
"Class which checks if an exception on some thread should cause worker " + |
114 | |
"to fail, by default all exceptions cause failure"); |
115 | |
|
116 | |
/** Metric name used for the per-superstep timer created in newSuperstep. */
public static final String TIMER_SUPERSTEP_TIME = "superstep-time-ms"; |
117 | |
|
118 | |
/** Metric name used for the timer wrapped around processGraphPartitions. */
public static final String TIMER_COMPUTE_ALL = "compute-all-ms"; |
119 | |
|
120 | |
/**
 * Metric name used for the timer started before compute and stopped by
 * notifySentMessages.
 */
public static final String TIMER_TIME_TO_FIRST_MSG = |
121 | |
"time-to-first-message-ms"; |
122 | |
|
123 | |
/**
 * Metric name used for the timer started by notifySentMessages and stopped
 * by notifyFinishedCommunication.
 */
public static final String TIMER_COMMUNICATION_TIME = "communication-time-ms"; |
124 | |
|
125 | |
/** Name of the counter that accumulates GC durations reported to this class. */
public static final String TIMER_SUPERSTEP_GC_TIME = "superstep-gc-time-ms"; |
126 | |
|
127 | |
|
/** Class logger. */
128 | 0 | private static final Logger LOG = Logger.getLogger(GraphTaskManager.class); |
129 | |
|
130 | |
/** Worker-side BSP service; created in instantiateBspService for worker tasks. */
private CentralizedServiceWorker<I, V, E> serviceWorker; |
131 | |
|
132 | |
/** Master-side BSP service; created in instantiateBspService for master tasks. */
private CentralizedServiceMaster<I, V, E> serviceMaster; |
133 | |
|
/** Thread running the master service; stays null when this task is not a master. */
134 | 0 | private Thread masterThread = null; |
135 | |
|
/** Set by checkTaskState so that a second execute() on a worker is rejected. */
136 | 0 | private boolean alreadyRun = false; |
137 | |
|
138 | |
/** ZooKeeper manager; only created when no ZooKeeper list is configured. */
private ZooKeeperManager zkManager; |
139 | |
|
140 | |
/** Giraph configuration, built from the Hadoop configuration in setup. */
private ImmutableClassesGiraphConfiguration<I, V, E> conf; |
141 | |
|
/** Set when the ZooKeeper manager reports the computation as already done. */
142 | 0 | private boolean done = false; |
143 | |
|
/** Roles of this task; UNKNOWN until setup calls determineGraphFunctions. */
144 | 0 | private GraphFunctions graphFunctions = GraphFunctions.UNKNOWN; |
145 | |
|
/** Stats of the most recently finished superstep; starts as an all-zero value. */
146 | 0 | private FinishedSuperstepStats finishedSuperstepStats = |
147 | |
new FinishedSuperstepStats(0, false, 0, 0, false, CheckpointStatus.NONE); |
148 | |
|
149 | |
/** Job progress tracker client; a no-op instance when tracking is disabled. */
private JobProgressTrackerClient jobProgressTracker; |
150 | |
|
151 | |
|
152 | |
|
153 | |
/** Per-job timer around the worker context preApplication call. */
private GiraphTimer wcPreAppTimer; |
154 | |
|
155 | |
/** Per-job timer around the worker context postApplication call. */
private GiraphTimer wcPostAppTimer; |
156 | |
|
157 | |
|
158 | |
|
159 | |
/** Superstep timer; recreated on every newSuperstep callback. */
private GiraphTimer superstepTimer; |
160 | |
|
161 | |
/** Timer around the compute phase; recreated on every newSuperstep callback. */
private GiraphTimer computeAll; |
162 | |
|
163 | |
/** Timer for time to first message; also used as a lock in notifySentMessages. */
private GiraphTimer timeToFirstMessage; |
164 | |
|
165 | |
/** Running context of timeToFirstMessage, or null once it has been stopped. */
private GiraphTimerContext timeToFirstMessageTimerContext; |
166 | |
|
167 | |
/** Communication timer; also used as a lock in notifyFinishedCommunication. */
private GiraphTimer communicationTimer; |
168 | |
|
169 | |
/** Running context of communicationTimer, or null when it is not running. */
private GiraphTimerContext communicationTimerContext; |
170 | |
|
171 | |
/** Timer around the worker context preSuperstep call. */
private GiraphTimer wcPreSuperstepTimer; |
172 | |
|
173 | |
/** Counter incremented with each reported GC duration. */
private Counter gcTimeMetric; |
174 | |
|
175 | |
/** Mapper context supplied to the constructor. */
private final Mapper<?, ?, ?, ?>.Context context; |
176 | |
|
177 | |
/** Master flag; false after construction and changed through setIsMaster. */
private boolean isMaster; |
178 | |
|
179 | |
/** Mapper observers created from the configuration in setupMapperObservers. */
private MapperObserver[] mapperObservers; |
180 | |
|
181 | |
|
182 | |
|
183 | |
|
184 | |
|
185 | |
|
/**
 * Creates a task manager bound to the given mapper context. The new instance
 * starts with its master flag cleared.
 *
 * @param context mapper context this task manager operates on
 */
public GraphTaskManager(Mapper<?, ?, ?, ?>.Context context) {
  isMaster = false;
  this.context = context;
}
190 | |
|
191 | |
|
192 | |
|
193 | |
|
194 | |
private void checkInput() { |
195 | 0 | if (conf.hasEdgeInputFormat()) { |
196 | 0 | conf.createWrappedEdgeInputFormat().checkInputSpecs(conf); |
197 | |
} |
198 | 0 | if (conf.hasVertexInputFormat()) { |
199 | 0 | conf.createWrappedVertexInputFormat().checkInputSpecs(conf); |
200 | |
} |
201 | 0 | } |
202 | |
|
203 | |
|
204 | |
|
205 | |
|
206 | |
|
207 | |
|
208 | |
|
209 | |
|
210 | |
private void createZooKeeperCounter(String serverPortList) { |
211 | |
|
212 | 0 | context.getCounter(GiraphConstants.ZOOKEEPER_SERVER_PORT_COUNTER_GROUP, |
213 | |
serverPortList); |
214 | 0 | } |
215 | |
|
216 | |
|
217 | |
|
218 | |
|
219 | |
|
220 | |
|
221 | |
public void setup(Path[] zkPathList) |
222 | |
throws IOException, InterruptedException { |
223 | 0 | context.setStatus("setup: Beginning worker setup."); |
224 | 0 | Configuration hadoopConf = context.getConfiguration(); |
225 | 0 | conf = new ImmutableClassesGiraphConfiguration<I, V, E>(hadoopConf); |
226 | 0 | initializeJobProgressTracker(); |
227 | |
|
228 | 0 | Thread.setDefaultUncaughtExceptionHandler(createUncaughtExceptionHandler()); |
229 | 0 | setupMapperObservers(); |
230 | |
|
231 | |
|
232 | |
|
233 | 0 | conf.getGiraphTypes().writeIfUnset(conf); |
234 | |
|
235 | 0 | initializeAndConfigureLogging(); |
236 | |
|
237 | 0 | setupAndInitializeGiraphMetrics(); |
238 | |
|
239 | 0 | checkInput(); |
240 | |
|
241 | 0 | ScriptLoader.loadScripts(conf); |
242 | |
|
243 | 0 | conf.createComputationFactory().initialize(conf); |
244 | |
|
245 | 0 | context.setStatus("setup: Initializing Zookeeper services."); |
246 | 0 | String serverPortList = conf.getZookeeperList(); |
247 | 0 | if (serverPortList.isEmpty()) { |
248 | 0 | if (startZooKeeperManager()) { |
249 | 0 | return; |
250 | |
} |
251 | |
} else { |
252 | 0 | createZooKeeperCounter(serverPortList); |
253 | |
} |
254 | 0 | if (zkManager != null && zkManager.runsZooKeeper()) { |
255 | 0 | if (LOG.isInfoEnabled()) { |
256 | 0 | LOG.info("setup: Chosen to run ZooKeeper..."); |
257 | |
} |
258 | |
} |
259 | 0 | context |
260 | 0 | .setStatus("setup: Connected to Zookeeper service " + serverPortList); |
261 | 0 | this.graphFunctions = determineGraphFunctions(conf, zkManager); |
262 | 0 | if (zkManager != null && this.graphFunctions.isMaster()) { |
263 | 0 | zkManager.cleanupOnExit(); |
264 | |
} |
265 | |
try { |
266 | 0 | instantiateBspService(); |
267 | 0 | } catch (IOException e) { |
268 | 0 | LOG.error("setup: Caught exception just before end of setup", e); |
269 | 0 | if (zkManager != null) { |
270 | 0 | zkManager.offlineZooKeeperServers(ZooKeeperManager.State.FAILED); |
271 | |
} |
272 | 0 | throw new RuntimeException( |
273 | |
"setup: Offlining servers due to exception...", e); |
274 | 0 | } |
275 | 0 | context.setStatus(getGraphFunctions().toString() + " starting..."); |
276 | 0 | } |
277 | |
|
278 | |
|
279 | |
|
280 | |
|
281 | |
|
282 | |
|
283 | |
private void initializeJobProgressTracker() { |
284 | 0 | if (!conf.trackJobProgressOnClient()) { |
285 | 0 | jobProgressTracker = new JobProgressTrackerClientNoOp(); |
286 | |
} else { |
287 | 0 | jobProgressTracker = |
288 | 0 | GiraphConstants.JOB_PROGRESS_TRACKER_CLIENT_CLASS.newInstance(conf); |
289 | |
try { |
290 | 0 | jobProgressTracker.init(conf); |
291 | |
|
292 | 0 | } catch (Exception e) { |
293 | |
|
294 | 0 | throw new RuntimeException( |
295 | |
"Failed to initialize JobProgressTrackerClient", e); |
296 | 0 | } |
297 | |
} |
298 | 0 | jobProgressTracker.mapperStarted(); |
299 | 0 | } |
300 | |
|
301 | |
|
302 | |
|
303 | |
|
304 | |
|
305 | |
|
306 | |
|
307 | |
|
308 | |
|
309 | |
public void execute() throws IOException, InterruptedException { |
310 | 0 | if (checkTaskState()) { |
311 | 0 | return; |
312 | |
} |
313 | 0 | preLoadOnWorkerObservers(); |
314 | 0 | GiraphTimerContext superstepTimerContext = superstepTimer.time(); |
315 | 0 | finishedSuperstepStats = serviceWorker.setup(); |
316 | 0 | superstepTimerContext.stop(); |
317 | 0 | if (collectInputSuperstepStats(finishedSuperstepStats)) { |
318 | 0 | return; |
319 | |
} |
320 | 0 | prepareGraphStateAndWorkerContext(); |
321 | 0 | List<PartitionStats> partitionStatsList = new ArrayList<PartitionStats>(); |
322 | 0 | int numComputeThreads = conf.getNumComputeThreads(); |
323 | |
|
324 | |
|
325 | 0 | while (!finishedSuperstepStats.allVerticesHalted()) { |
326 | 0 | final long superstep = serviceWorker.getSuperstep(); |
327 | 0 | superstepTimerContext = getTimerForThisSuperstep(superstep); |
328 | 0 | GraphState graphState = new GraphState(superstep, |
329 | 0 | finishedSuperstepStats.getVertexCount(), |
330 | 0 | finishedSuperstepStats.getEdgeCount(), |
331 | |
context); |
332 | 0 | Collection<? extends PartitionOwner> masterAssignedPartitionOwners = |
333 | 0 | serviceWorker.startSuperstep(); |
334 | 0 | if (LOG.isDebugEnabled()) { |
335 | 0 | LOG.debug("execute: " + MemoryUtils.getRuntimeMemoryStats()); |
336 | |
} |
337 | 0 | context.progress(); |
338 | 0 | serviceWorker.exchangeVertexPartitions(masterAssignedPartitionOwners); |
339 | 0 | context.progress(); |
340 | 0 | boolean hasBeenRestarted = checkSuperstepRestarted(superstep); |
341 | |
|
342 | 0 | GlobalStats globalStats = serviceWorker.getGlobalStats(); |
343 | |
|
344 | 0 | if (hasBeenRestarted) { |
345 | 0 | graphState = new GraphState(superstep, |
346 | 0 | finishedSuperstepStats.getVertexCount(), |
347 | 0 | finishedSuperstepStats.getEdgeCount(), |
348 | |
context); |
349 | 0 | } else if (storeCheckpoint(globalStats.getCheckpointStatus())) { |
350 | 0 | break; |
351 | |
} |
352 | 0 | serviceWorker.getServerData().prepareResolveMutations(); |
353 | 0 | context.progress(); |
354 | 0 | prepareForSuperstep(graphState); |
355 | 0 | context.progress(); |
356 | 0 | MessageStore<I, Writable> messageStore = |
357 | 0 | serviceWorker.getServerData().getCurrentMessageStore(); |
358 | 0 | int numPartitions = serviceWorker.getPartitionStore().getNumPartitions(); |
359 | 0 | int numThreads = Math.min(numComputeThreads, numPartitions); |
360 | 0 | if (LOG.isInfoEnabled()) { |
361 | 0 | LOG.info("execute: " + numPartitions + " partitions to process with " + |
362 | |
numThreads + " compute thread(s), originally " + |
363 | |
numComputeThreads + " thread(s) on superstep " + superstep); |
364 | |
} |
365 | 0 | partitionStatsList.clear(); |
366 | |
|
367 | 0 | if (numPartitions > 0) { |
368 | 0 | processGraphPartitions(context, partitionStatsList, graphState, |
369 | |
messageStore, numThreads); |
370 | |
} |
371 | 0 | finishedSuperstepStats = completeSuperstepAndCollectStats( |
372 | |
partitionStatsList, superstepTimerContext); |
373 | |
|
374 | |
|
375 | 0 | } |
376 | |
|
377 | 0 | if (LOG.isInfoEnabled()) { |
378 | 0 | LOG.info("execute: BSP application done (global vertices marked done)"); |
379 | |
} |
380 | 0 | updateSuperstepGraphState(); |
381 | 0 | postApplication(); |
382 | 0 | } |
383 | |
|
384 | |
|
385 | |
|
386 | |
|
387 | |
private void postApplication() throws IOException, InterruptedException { |
388 | 0 | GiraphTimerContext postAppTimerContext = wcPostAppTimer.time(); |
389 | 0 | serviceWorker.getWorkerContext().postApplication(); |
390 | 0 | serviceWorker.getSuperstepOutput().postApplication(); |
391 | 0 | postAppTimerContext.stop(); |
392 | 0 | context.progress(); |
393 | |
|
394 | 0 | for (WorkerObserver obs : serviceWorker.getWorkerObservers()) { |
395 | 0 | obs.postApplication(); |
396 | 0 | context.progress(); |
397 | |
} |
398 | 0 | } |
399 | |
|
400 | |
|
401 | |
|
402 | |
|
403 | |
|
404 | |
public void setIsMaster(final boolean im) { |
405 | 0 | this.isMaster = im; |
406 | 0 | } |
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
public boolean isMaster() { |
414 | 0 | return isMaster; |
415 | |
} |
416 | |
|
417 | |
|
418 | |
|
419 | |
|
420 | |
|
421 | |
|
422 | |
|
423 | |
private GiraphTimerContext getTimerForThisSuperstep(long superstep) { |
424 | 0 | GiraphMetrics.get().resetSuperstepMetrics(superstep); |
425 | 0 | return superstepTimer.time(); |
426 | |
} |
427 | |
|
428 | |
|
429 | |
|
430 | |
|
431 | |
private void setupAndInitializeGiraphMetrics() { |
432 | 0 | GiraphMetrics.init(conf); |
433 | 0 | GiraphMetrics.get().addSuperstepResetObserver(this); |
434 | 0 | initJobMetrics(); |
435 | 0 | MemoryUtils.initMetrics(); |
436 | 0 | InputSplitsCallable.initMetrics(); |
437 | 0 | } |
438 | |
|
439 | |
|
440 | |
|
441 | |
|
442 | |
|
443 | |
|
444 | |
|
445 | |
private boolean startZooKeeperManager() |
446 | |
throws IOException, InterruptedException { |
447 | 0 | zkManager = new ZooKeeperManager(context, conf); |
448 | 0 | context.setStatus("setup: Setting up Zookeeper manager."); |
449 | 0 | zkManager.setup(); |
450 | 0 | if (zkManager.computationDone()) { |
451 | 0 | done = true; |
452 | 0 | return true; |
453 | |
} |
454 | 0 | zkManager.onlineZooKeeperServer(); |
455 | 0 | String serverPortList = zkManager.getZooKeeperServerPortString(); |
456 | 0 | conf.setZookeeperList(serverPortList); |
457 | 0 | createZooKeeperCounter(serverPortList); |
458 | 0 | return false; |
459 | |
} |
460 | |
|
461 | |
|
462 | |
|
463 | |
|
464 | |
private void updateSuperstepGraphState() { |
465 | 0 | serviceWorker.getWorkerContext().setGraphState( |
466 | 0 | new GraphState(serviceWorker.getSuperstep(), |
467 | 0 | finishedSuperstepStats.getVertexCount(), |
468 | 0 | finishedSuperstepStats.getEdgeCount(), context)); |
469 | 0 | } |
470 | |
|
471 | |
|
472 | |
|
473 | |
|
474 | |
|
475 | |
|
476 | |
|
477 | |
|
478 | |
private FinishedSuperstepStats completeSuperstepAndCollectStats( |
479 | |
List<PartitionStats> partitionStatsList, |
480 | |
GiraphTimerContext superstepTimerContext) { |
481 | |
|
482 | |
|
483 | |
|
484 | |
|
485 | 0 | finishedSuperstepStats = |
486 | 0 | serviceWorker.finishSuperstep(partitionStatsList, superstepTimerContext); |
487 | 0 | if (conf.metricsEnabled()) { |
488 | 0 | GiraphMetrics.get().perSuperstep().printSummary(System.err); |
489 | |
} |
490 | 0 | return finishedSuperstepStats; |
491 | |
} |
492 | |
|
493 | |
|
494 | |
|
495 | |
|
496 | |
|
497 | |
|
498 | |
private void prepareForSuperstep(GraphState graphState) { |
499 | 0 | serviceWorker.prepareSuperstep(); |
500 | |
|
501 | 0 | serviceWorker.getWorkerContext().setGraphState(graphState); |
502 | 0 | serviceWorker.getWorkerContext().setupSuperstep(serviceWorker); |
503 | 0 | GiraphTimerContext preSuperstepTimer = wcPreSuperstepTimer.time(); |
504 | 0 | serviceWorker.getWorkerContext().preSuperstep(); |
505 | 0 | preSuperstepTimer.stop(); |
506 | 0 | context.progress(); |
507 | |
|
508 | 0 | for (WorkerObserver obs : serviceWorker.getWorkerObservers()) { |
509 | 0 | obs.preSuperstep(graphState.getSuperstep()); |
510 | 0 | context.progress(); |
511 | |
} |
512 | 0 | } |
513 | |
|
514 | |
|
515 | |
|
516 | |
|
517 | |
private void prepareGraphStateAndWorkerContext() { |
518 | 0 | updateSuperstepGraphState(); |
519 | 0 | workerContextPreApp(); |
520 | 0 | } |
521 | |
|
522 | |
|
523 | |
|
524 | |
|
525 | |
|
526 | |
|
527 | |
|
528 | |
public GraphFunctions getGraphFunctions() { |
529 | 0 | return graphFunctions; |
530 | |
} |
531 | |
|
532 | |
public final WorkerContext getWorkerContext() { |
533 | 0 | return serviceWorker.getWorkerContext(); |
534 | |
} |
535 | |
|
536 | |
public JobProgressTracker getJobProgressTracker() { |
537 | 0 | return jobProgressTracker; |
538 | |
} |
539 | |
|
540 | |
|
541 | |
|
542 | |
|
543 | |
|
544 | |
|
545 | |
|
546 | |
|
547 | |
|
548 | |
|
549 | |
|
550 | |
|
551 | |
|
552 | |
|
553 | |
|
554 | |
|
555 | |
|
556 | |
private static GraphFunctions determineGraphFunctions( |
557 | |
ImmutableClassesGiraphConfiguration conf, |
558 | |
ZooKeeperManager zkManager) { |
559 | 0 | boolean splitMasterWorker = conf.getSplitMasterWorker(); |
560 | 0 | int taskPartition = conf.getTaskPartition(); |
561 | 0 | boolean zkAlreadyProvided = conf.isZookeeperExternal(); |
562 | 0 | GraphFunctions functions = GraphFunctions.UNKNOWN; |
563 | |
|
564 | 0 | if (!splitMasterWorker) { |
565 | 0 | if ((zkManager != null) && zkManager.runsZooKeeper()) { |
566 | 0 | functions = GraphFunctions.ALL; |
567 | |
} else { |
568 | 0 | functions = GraphFunctions.ALL_EXCEPT_ZOOKEEPER; |
569 | |
} |
570 | |
} else { |
571 | 0 | if (zkAlreadyProvided) { |
572 | 0 | if (taskPartition == 0) { |
573 | 0 | functions = GraphFunctions.MASTER_ONLY; |
574 | |
} else { |
575 | 0 | functions = GraphFunctions.WORKER_ONLY; |
576 | |
} |
577 | |
} else { |
578 | 0 | if ((zkManager != null) && zkManager.runsZooKeeper()) { |
579 | 0 | functions = GraphFunctions.MASTER_ZOOKEEPER_ONLY; |
580 | |
} else { |
581 | 0 | functions = GraphFunctions.WORKER_ONLY; |
582 | |
} |
583 | |
} |
584 | |
} |
585 | 0 | return functions; |
586 | |
} |
587 | |
|
588 | |
|
589 | |
|
590 | |
|
591 | |
|
592 | |
private void instantiateBspService() |
593 | |
throws IOException, InterruptedException { |
594 | 0 | if (graphFunctions.isMaster()) { |
595 | 0 | if (LOG.isInfoEnabled()) { |
596 | 0 | LOG.info("setup: Starting up BspServiceMaster " + |
597 | |
"(master thread)..."); |
598 | |
} |
599 | 0 | serviceMaster = new BspServiceMaster<I, V, E>(context, this); |
600 | 0 | masterThread = new MasterThread<I, V, E>(serviceMaster, context); |
601 | 0 | masterThread.setUncaughtExceptionHandler( |
602 | 0 | createUncaughtExceptionHandler()); |
603 | 0 | masterThread.start(); |
604 | |
} |
605 | 0 | if (graphFunctions.isWorker()) { |
606 | 0 | if (LOG.isInfoEnabled()) { |
607 | 0 | LOG.info("setup: Starting up BspServiceWorker..."); |
608 | |
} |
609 | 0 | serviceWorker = new BspServiceWorker<I, V, E>(context, this); |
610 | 0 | installGCMonitoring(); |
611 | 0 | if (LOG.isInfoEnabled()) { |
612 | 0 | LOG.info("setup: Registering health of this worker..."); |
613 | |
} |
614 | |
} |
615 | 0 | } |
616 | |
|
617 | |
|
618 | |
|
619 | |
|
620 | |
|
621 | |
private void installGCMonitoring() { |
622 | 0 | final GcObserver[] gcObservers = conf.createGcObservers(context); |
623 | |
List<GarbageCollectorMXBean> mxBeans = ManagementFactory |
624 | 0 | .getGarbageCollectorMXBeans(); |
625 | 0 | final OutOfCoreEngine oocEngine = |
626 | 0 | serviceWorker.getServerData().getOocEngine(); |
627 | 0 | for (GarbageCollectorMXBean gcBean : mxBeans) { |
628 | 0 | NotificationEmitter emitter = (NotificationEmitter) gcBean; |
629 | 0 | NotificationListener listener = new NotificationListener() { |
630 | |
@Override |
631 | |
public void handleNotification(Notification notification, |
632 | |
Object handle) { |
633 | 0 | if (notification.getType().equals(GarbageCollectionNotificationInfo |
634 | |
.GARBAGE_COLLECTION_NOTIFICATION)) { |
635 | 0 | GarbageCollectionNotificationInfo info = |
636 | 0 | GarbageCollectionNotificationInfo.from( |
637 | 0 | (CompositeData) notification.getUserData()); |
638 | |
|
639 | 0 | if (LOG.isInfoEnabled()) { |
640 | 0 | LOG.info("installGCMonitoring: name = " + info.getGcName() + |
641 | 0 | ", action = " + info.getGcAction() + ", cause = " + |
642 | 0 | info.getGcCause() + ", duration = " + |
643 | 0 | info.getGcInfo().getDuration() + "ms"); |
644 | |
} |
645 | 0 | gcTimeMetric.inc(info.getGcInfo().getDuration()); |
646 | 0 | GiraphMetrics.get().getGcTracker().gcOccurred(info.getGcInfo()); |
647 | 0 | for (GcObserver gcObserver : gcObservers) { |
648 | 0 | gcObserver.gcOccurred(info); |
649 | |
} |
650 | 0 | if (oocEngine != null) { |
651 | 0 | oocEngine.gcCompleted(info); |
652 | |
} |
653 | |
} |
654 | 0 | } |
655 | |
}; |
656 | |
|
657 | 0 | emitter.addNotificationListener(listener, null, null); |
658 | 0 | } |
659 | 0 | } |
660 | |
|
661 | |
|
662 | |
|
663 | |
|
664 | |
private void initializeAndConfigureLogging() { |
665 | |
|
666 | 0 | String logLevel = conf.getLocalLevel(); |
667 | 0 | if (!Logger.getRootLogger().getLevel().equals(Level.toLevel(logLevel))) { |
668 | 0 | Logger.getRootLogger().setLevel(Level.toLevel(logLevel)); |
669 | 0 | if (LOG.isInfoEnabled()) { |
670 | 0 | LOG.info("setup: Set log level to " + logLevel); |
671 | |
} |
672 | |
} else { |
673 | 0 | if (LOG.isInfoEnabled()) { |
674 | 0 | LOG.info("setup: Log level remains at " + logLevel); |
675 | |
} |
676 | |
} |
677 | |
|
678 | 0 | if (conf.useLogThreadLayout()) { |
679 | 0 | PatternLayout layout = |
680 | |
new PatternLayout("%-7p %d [%t] %c %x - %m%n"); |
681 | |
Enumeration<Appender> appenderEnum = |
682 | 0 | Logger.getRootLogger().getAllAppenders(); |
683 | 0 | while (appenderEnum.hasMoreElements()) { |
684 | 0 | appenderEnum.nextElement().setLayout(layout); |
685 | |
} |
686 | |
} |
687 | |
|
688 | |
|
689 | 0 | if (conf.getLocalTestMode()) { |
690 | 0 | LogManager.getLogger(org.apache.zookeeper.server.PrepRequestProcessor. |
691 | 0 | class.getName()).setLevel(Level.ERROR); |
692 | |
} |
693 | 0 | } |
694 | |
|
695 | |
|
696 | |
|
697 | |
|
698 | |
private void initJobMetrics() { |
699 | 0 | GiraphMetricsRegistry jobMetrics = GiraphMetrics.get().perJobOptional(); |
700 | 0 | wcPreAppTimer = new GiraphTimer(jobMetrics, "worker-context-pre-app", |
701 | |
TimeUnit.MILLISECONDS); |
702 | 0 | wcPostAppTimer = new GiraphTimer(jobMetrics, "worker-context-post-app", |
703 | |
TimeUnit.MILLISECONDS); |
704 | 0 | } |
705 | |
|
706 | |
@Override |
707 | |
public void newSuperstep(SuperstepMetricsRegistry superstepMetrics) { |
708 | 0 | superstepTimer = new GiraphTimer(superstepMetrics, |
709 | |
TIMER_SUPERSTEP_TIME, TimeUnit.MILLISECONDS); |
710 | 0 | computeAll = new GiraphTimer(superstepMetrics, |
711 | |
TIMER_COMPUTE_ALL, TimeUnit.MILLISECONDS); |
712 | 0 | timeToFirstMessage = new GiraphTimer(superstepMetrics, |
713 | |
TIMER_TIME_TO_FIRST_MSG, TimeUnit.MICROSECONDS); |
714 | 0 | communicationTimer = new GiraphTimer(superstepMetrics, |
715 | |
TIMER_COMMUNICATION_TIME, TimeUnit.MILLISECONDS); |
716 | 0 | gcTimeMetric = superstepMetrics.getCounter(TIMER_SUPERSTEP_GC_TIME); |
717 | 0 | wcPreSuperstepTimer = new GiraphTimer(superstepMetrics, |
718 | |
"worker-context-pre-superstep", TimeUnit.MILLISECONDS); |
719 | 0 | } |
720 | |
|
721 | |
|
722 | |
|
723 | |
|
724 | |
public void notifySentMessages() { |
725 | |
|
726 | |
|
727 | 0 | GiraphTimerContext tmp = timeToFirstMessageTimerContext; |
728 | 0 | if (tmp != null) { |
729 | 0 | synchronized (timeToFirstMessage) { |
730 | 0 | if (timeToFirstMessageTimerContext != null) { |
731 | 0 | timeToFirstMessageTimerContext.stop(); |
732 | 0 | timeToFirstMessageTimerContext = null; |
733 | 0 | communicationTimerContext = communicationTimer.time(); |
734 | |
} |
735 | 0 | } |
736 | |
} |
737 | 0 | } |
738 | |
|
739 | |
|
740 | |
|
741 | |
|
742 | |
|
743 | |
public void notifyFinishedCommunication() { |
744 | 0 | GiraphTimerContext tmp = communicationTimerContext; |
745 | 0 | if (tmp != null) { |
746 | 0 | synchronized (communicationTimer) { |
747 | 0 | if (communicationTimerContext != null) { |
748 | 0 | communicationTimerContext.stop(); |
749 | 0 | communicationTimerContext = null; |
750 | |
} |
751 | 0 | } |
752 | |
} |
753 | 0 | } |
754 | |
|
755 | |
|
756 | |
|
757 | |
|
758 | |
|
759 | |
|
760 | |
|
761 | |
|
762 | |
|
763 | |
private void processGraphPartitions(final Mapper<?, ?, ?, ?>.Context context, |
764 | |
List<PartitionStats> partitionStatsList, |
765 | |
final GraphState graphState, |
766 | |
final MessageStore<I, Writable> messageStore, |
767 | |
int numThreads) { |
768 | 0 | PartitionStore<I, V, E> partitionStore = serviceWorker.getPartitionStore(); |
769 | 0 | long verticesToCompute = 0; |
770 | 0 | for (Integer partitionId : partitionStore.getPartitionIds()) { |
771 | 0 | verticesToCompute += partitionStore.getPartitionVertexCount(partitionId); |
772 | 0 | } |
773 | 0 | WorkerProgress.get().startSuperstep( |
774 | 0 | serviceWorker.getSuperstep(), verticesToCompute, |
775 | 0 | serviceWorker.getPartitionStore().getNumPartitions()); |
776 | 0 | partitionStore.startIteration(); |
777 | |
|
778 | 0 | GiraphTimerContext computeAllTimerContext = computeAll.time(); |
779 | 0 | timeToFirstMessageTimerContext = timeToFirstMessage.time(); |
780 | |
|
781 | 0 | CallableFactory<Collection<PartitionStats>> callableFactory = |
782 | 0 | new CallableFactory<Collection<PartitionStats>>() { |
783 | |
@Override |
784 | |
public Callable<Collection<PartitionStats>> newCallable( |
785 | |
int callableId) { |
786 | 0 | return new ComputeCallable<I, V, E, Writable, Writable>( |
787 | |
context, |
788 | |
graphState, |
789 | |
messageStore, |
790 | 0 | conf, |
791 | 0 | serviceWorker); |
792 | |
} |
793 | |
}; |
794 | 0 | List<Collection<PartitionStats>> results = |
795 | 0 | ProgressableUtils.getResultsWithNCallables(callableFactory, numThreads, |
796 | |
"compute-%d", context); |
797 | |
|
798 | 0 | for (Collection<PartitionStats> result : results) { |
799 | 0 | partitionStatsList.addAll(result); |
800 | 0 | } |
801 | |
|
802 | 0 | computeAllTimerContext.stop(); |
803 | 0 | } |
804 | |
|
805 | |
|
806 | |
|
807 | |
|
808 | |
|
809 | |
|
810 | |
private boolean checkSuperstepRestarted(long superstep) throws IOException { |
811 | |
|
812 | |
|
813 | 0 | if (serviceWorker.getRestartedSuperstep() == superstep) { |
814 | 0 | if (LOG.isInfoEnabled()) { |
815 | 0 | LOG.info("execute: Loading from checkpoint " + superstep); |
816 | |
} |
817 | 0 | VertexEdgeCount vertexEdgeCount = serviceWorker.loadCheckpoint( |
818 | 0 | serviceWorker.getRestartedSuperstep()); |
819 | 0 | finishedSuperstepStats = new FinishedSuperstepStats(0, false, |
820 | 0 | vertexEdgeCount.getVertexCount(), vertexEdgeCount.getEdgeCount(), |
821 | |
false, CheckpointStatus.NONE); |
822 | 0 | return true; |
823 | |
} |
824 | 0 | return false; |
825 | |
} |
826 | |
|
827 | |
|
828 | |
|
829 | |
|
830 | |
|
831 | |
|
832 | |
|
833 | |
|
834 | |
private boolean storeCheckpoint(CheckpointStatus checkpointStatus) |
835 | |
throws IOException { |
836 | 0 | if (checkpointStatus != CheckpointStatus.NONE) { |
837 | 0 | serviceWorker.storeCheckpoint(); |
838 | |
} |
839 | 0 | return checkpointStatus == CheckpointStatus.CHECKPOINT_AND_HALT; |
840 | |
} |
841 | |
|
842 | |
|
843 | |
|
844 | |
|
845 | |
|
846 | |
|
847 | |
|
848 | |
|
849 | |
|
850 | |
private boolean collectInputSuperstepStats( |
851 | |
FinishedSuperstepStats inputSuperstepStats) { |
852 | 0 | if (inputSuperstepStats.getVertexCount() == 0 && |
853 | 0 | !inputSuperstepStats.mustLoadCheckpoint()) { |
854 | 0 | LOG.warn("map: No vertices in the graph, exiting."); |
855 | 0 | return true; |
856 | |
} |
857 | 0 | if (conf.metricsEnabled()) { |
858 | 0 | GiraphMetrics.get().perSuperstep().printSummary(System.err); |
859 | |
} |
860 | 0 | return false; |
861 | |
} |
862 | |
|
863 | |
|
864 | |
|
865 | |
|
866 | |
|
867 | |
private boolean checkTaskState() { |
868 | 0 | if (done) { |
869 | 0 | return true; |
870 | |
} |
871 | 0 | GiraphMetrics.get().resetSuperstepMetrics(BspService.INPUT_SUPERSTEP); |
872 | 0 | if (graphFunctions.isNotAWorker()) { |
873 | 0 | if (LOG.isInfoEnabled()) { |
874 | 0 | LOG.info("map: No need to do anything when not a worker"); |
875 | |
} |
876 | 0 | return true; |
877 | |
} |
878 | 0 | if (alreadyRun) { |
879 | 0 | throw new RuntimeException("map: In BSP, map should have only been" + |
880 | |
" run exactly once, (already run)"); |
881 | |
} |
882 | 0 | alreadyRun = true; |
883 | 0 | return false; |
884 | |
} |
885 | |
|
886 | |
|
887 | |
|
888 | |
|
889 | |
private void workerContextPreApp() { |
890 | 0 | GiraphTimerContext preAppTimerContext = wcPreAppTimer.time(); |
891 | |
try { |
892 | 0 | serviceWorker.getWorkerContext().preApplication(); |
893 | 0 | } catch (InstantiationException e) { |
894 | 0 | LOG.fatal("execute: preApplication failed in instantiation", e); |
895 | 0 | throw new RuntimeException( |
896 | |
"execute: preApplication failed in instantiation", e); |
897 | 0 | } catch (IllegalAccessException e) { |
898 | 0 | LOG.fatal("execute: preApplication failed in access", e); |
899 | 0 | throw new RuntimeException( |
900 | |
"execute: preApplication failed in access", e); |
901 | 0 | } |
902 | 0 | preAppTimerContext.stop(); |
903 | 0 | context.progress(); |
904 | |
|
905 | 0 | for (WorkerObserver obs : serviceWorker.getWorkerObservers()) { |
906 | 0 | obs.preApplication(); |
907 | 0 | context.progress(); |
908 | |
} |
909 | 0 | } |
910 | |
|
911 | |
|
912 | |
|
913 | |
|
914 | |
public void setupMapperObservers() { |
915 | 0 | mapperObservers = conf.createMapperObservers(context); |
916 | 0 | for (MapperObserver mapperObserver : mapperObservers) { |
917 | 0 | mapperObserver.setup(); |
918 | |
} |
919 | 0 | } |
920 | |
|
921 | |
|
922 | |
|
923 | |
|
924 | |
private void preLoadOnWorkerObservers() { |
925 | 0 | for (WorkerObserver obs : serviceWorker.getWorkerObservers()) { |
926 | 0 | obs.preLoad(); |
927 | 0 | context.progress(); |
928 | |
} |
929 | 0 | } |
930 | |
|
931 | |
|
932 | |
|
933 | |
|
934 | |
private void postSaveOnWorkerObservers() { |
935 | 0 | for (WorkerObserver obs : serviceWorker.getWorkerObservers()) { |
936 | 0 | obs.postSave(); |
937 | 0 | context.progress(); |
938 | |
} |
939 | 0 | } |
940 | |
|
941 | |
|
942 | |
|
943 | |
|
944 | |
public void cleanup() |
945 | |
throws IOException, InterruptedException { |
946 | 0 | if (LOG.isInfoEnabled()) { |
947 | 0 | LOG.info("cleanup: Starting for " + getGraphFunctions()); |
948 | |
} |
949 | 0 | jobProgressTracker.cleanup(); |
950 | 0 | if (done) { |
951 | 0 | return; |
952 | |
} |
953 | |
|
954 | 0 | if (serviceWorker != null) { |
955 | 0 | serviceWorker.cleanup(finishedSuperstepStats); |
956 | |
} |
957 | 0 | } |
958 | |
|
959 | |
|
960 | |
|
961 | |
|
962 | |
|
963 | |
public void sendWorkerCountersAndFinishCleanup() { |
964 | 0 | if (serviceWorker != null) { |
965 | 0 | postSaveOnWorkerObservers(); |
966 | 0 | serviceWorker.storeCountersInZooKeeper(true); |
967 | 0 | serviceWorker.closeZooKeeper(); |
968 | |
} |
969 | |
try { |
970 | 0 | if (masterThread != null) { |
971 | 0 | masterThread.join(); |
972 | 0 | LOG.info("cleanup: Joined with master thread"); |
973 | |
} |
974 | 0 | } catch (InterruptedException e) { |
975 | |
|
976 | 0 | LOG.error("cleanup: Master thread couldn't join"); |
977 | 0 | } |
978 | 0 | if (zkManager != null) { |
979 | 0 | LOG.info("cleanup: Offlining ZooKeeper servers"); |
980 | |
try { |
981 | 0 | zkManager.offlineZooKeeperServers(ZooKeeperManager.State.FINISHED); |
982 | |
|
983 | |
|
984 | |
|
985 | |
|
986 | |
|
987 | |
|
988 | |
|
989 | 0 | } catch (Throwable e) { |
990 | |
|
991 | 0 | LOG.error("cleanup: Error offlining zookeeper", e); |
992 | 0 | } |
993 | |
} |
994 | |
|
995 | |
|
996 | 0 | GiraphMetrics.get().shutdown(); |
997 | 0 | } |
998 | |
|
999 | |
|
1000 | |
|
1001 | |
|
1002 | |
|
1003 | |
public void zooKeeperCleanup() { |
1004 | 0 | if (graphFunctions.isZooKeeper()) { |
1005 | |
|
1006 | 0 | if (zkManager != null) { |
1007 | 0 | zkManager.cleanup(); |
1008 | |
} |
1009 | |
} |
1010 | 0 | } |
1011 | |
|
1012 | |
|
1013 | |
|
1014 | |
|
1015 | |
|
1016 | |
public void workerFailureCleanup() { |
1017 | |
try { |
1018 | 0 | if (graphFunctions.isWorker()) { |
1019 | 0 | serviceWorker.failureCleanup(); |
1020 | |
} |
1021 | |
|
1022 | 0 | GiraphMetrics.get().shutdown(); |
1023 | |
|
1024 | |
|
1025 | |
|
1026 | 0 | } catch (RuntimeException e1) { |
1027 | |
|
1028 | 0 | LOG.error("run: Worker failure failed on another RuntimeException, " + |
1029 | |
"original expection will be rethrown", e1); |
1030 | 0 | } |
1031 | 0 | } |
1032 | |
|
1033 | |
|
1034 | |
|
1035 | |
|
1036 | |
|
1037 | |
|
1038 | |
public Thread.UncaughtExceptionHandler createUncaughtExceptionHandler() { |
1039 | 0 | return new OverrideExceptionHandler( |
1040 | 0 | CHECKER_IF_WORKER_SHOULD_FAIL_AFTER_EXCEPTION_CLASS.newInstance( |
1041 | 0 | getConf()), getJobProgressTracker()); |
1042 | |
} |
1043 | |
|
1044 | |
|
1045 | |
|
1046 | |
|
1047 | |
|
1048 | |
|
1049 | |
|
1050 | |
|
1051 | |
public Thread.UncaughtExceptionHandler createUncaughtExceptionHandler( |
1052 | |
CheckerIfWorkerShouldFailAfterException checker) { |
1053 | 0 | return new OverrideExceptionHandler(checker, getJobProgressTracker()); |
1054 | |
} |
1055 | |
|
1056 | |
public ImmutableClassesGiraphConfiguration<I, V, E> getConf() { |
1057 | 0 | return conf; |
1058 | |
} |
1059 | |
|
1060 | |
|
1061 | |
|
1062 | |
|
1063 | |
public long getSuperstepGCTime() { |
1064 | 0 | return (gcTimeMetric == null) ? 0 : gcTimeMetric.count(); |
1065 | |
} |
1066 | |
|
1067 | |
|
1068 | |
|
1069 | |
|
1070 | |
|
1071 | |
|
1072 | |
|
1073 | |
|
1074 | |
public String getZookeeperList() { |
1075 | 0 | if (zkManager != null) { |
1076 | 0 | return zkManager.getZooKeeperServerPortString(); |
1077 | |
} else { |
1078 | 0 | return conf.getZookeeperList(); |
1079 | |
} |
1080 | |
} |
1081 | |
|
1082 | |
|
1083 | |
|
1084 | |
|
1085 | |
|
1086 | |
class OverrideExceptionHandler implements Thread.UncaughtExceptionHandler { |
1087 | |
|
1088 | |
private final CheckerIfWorkerShouldFailAfterException checker; |
1089 | |
|
1090 | |
private final JobProgressTracker jobProgressTracker; |
1091 | |
|
1092 | |
|
1093 | |
|
1094 | |
|
1095 | |
|
1096 | |
|
1097 | |
|
1098 | |
|
1099 | |
public OverrideExceptionHandler( |
1100 | |
CheckerIfWorkerShouldFailAfterException checker, |
1101 | 0 | JobProgressTracker jobProgressTracker) { |
1102 | 0 | this.checker = checker; |
1103 | 0 | this.jobProgressTracker = jobProgressTracker; |
1104 | 0 | } |
1105 | |
|
1106 | |
@Override |
1107 | |
public void uncaughtException(final Thread t, final Throwable e) { |
1108 | 0 | if (!checker.checkIfWorkerShouldFail(t, e)) { |
1109 | 0 | LOG.error( |
1110 | |
"uncaughtException: OverrideExceptionHandler on thread " + |
1111 | 0 | t.getName() + ", msg = " + e.getMessage(), e); |
1112 | 0 | return; |
1113 | |
} |
1114 | |
try { |
1115 | 0 | LOG.fatal( |
1116 | |
"uncaughtException: OverrideExceptionHandler on thread " + |
1117 | 0 | t.getName() + ", msg = " + e.getMessage() + ", exiting...", e); |
1118 | 0 | byte [] exByteArray = KryoWritableWrapper.convertToByteArray(e); |
1119 | 0 | jobProgressTracker.logError(ExceptionUtils.getStackTrace(e), |
1120 | |
exByteArray); |
1121 | 0 | zooKeeperCleanup(); |
1122 | 0 | workerFailureCleanup(); |
1123 | |
} finally { |
1124 | 0 | System.exit(1); |
1125 | 0 | } |
1126 | 0 | } |
1127 | |
} |
1128 | |
|
1129 | |
|
1130 | |
|
1131 | |
|
/**
 * Strategy consulted by the uncaught exception handler to decide whether
 * an uncaught exception should bring the worker down.
 */
public interface CheckerIfWorkerShouldFailAfterException {
  /**
   * Decides whether the worker should fail because of the given exception.
   *
   * @param thread thread on which the exception was raised
   * @param exception the uncaught exception
   * @return true if the worker should fail, false if the exception should
   *         only be logged
   */
  boolean checkIfWorkerShouldFail(Thread thread, Throwable exception);
}
1142 | |
|
1143 | |
|
1144 | |
|
1145 | |
|
1146 | 0 | public static class FailWithEveryExceptionOccurred |
1147 | |
implements CheckerIfWorkerShouldFailAfterException { |
1148 | |
@Override |
1149 | |
public boolean checkIfWorkerShouldFail(Thread thread, Throwable exception) { |
1150 | 0 | return true; |
1151 | |
} |
1152 | |
} |
1153 | |
|
1154 | |
|
1155 | |
|
1156 | |
|
1157 | |
|
1158 | |
|
1159 | |
|
1160 | |
|
1161 | |
|
1162 | |
public static boolean isConnectionResetByPeer(Throwable throwable) { |
1163 | 0 | return throwable.getMessage().startsWith( |
1164 | |
"Connection reset by peer") ? true : false; |
1165 | |
} |
1166 | |
} |