저는 Apache Spark을 배우기 시작했습니다. 나는 링크의 출력을 인쇄하려고하지만 어떤 이유로 그것을 표시하지 않습니다. 나는 또한 links.collect(), display (links)를 시도했지만 그 중 아무 것도 작동하지 않는다. 어떤 도움을 주시면 감사하겠습니다. 두 번째 이미지에 대한Apache Spark 출력을 볼 수 없습니다.
전체 스택 추적 :
Py4JJavaError Traceback (most recent call last)
<ipython-input-34-01e857cfa45e> in <module>()
----> 1 for link in links.collect():
2 print("%s" %(link))
/databricks/spark/python/pyspark/rdd.py in collect(self)
769 """
770 with SCCallSiteSync(self.context) as css:
--> 771 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
772 return list(_load_from_socket(port, self._jrdd_deserializer))
773
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 24.0 failed 1 times, most recent failure: Lost task 4.0 in stage 24.0 (TID 76, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/databricks/spark/python/pyspark/worker.py", line 111, in main
process()
File "/databricks/spark/python/pyspark/worker.py", line 106, in process
serializer.dump_stream(func(split_index, iterator), outfile)
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 2346, in pipeline_func
return func(split, prev_func(split, iterator))
File "/databricks/spark/python/pyspark/rdd.py", line 317, in func
return f(iterator)
File "/databricks/spark/python/pyspark/rdd.py", line 1776, in combineLocally
merger.mergeValues(iterator)
File "/databricks/spark/python/pyspark/shuffle.py", line 236, in mergeValues
for k, v in iterator:
File "<ipython-input-31-4b09041aa30b>", line 1, in <lambda>
File "<ipython-input-28-f43debc22073>", line 3, in parseNeighbors
File "/databricks/python/lib/python2.7/re.py", line 171, in split
return _compile(pattern, flags).split(string, maxsplit)
TypeError: expected string or buffer
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:46)
at org.apache.spark.scheduler.Task.run(Task.scala:96)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:222)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
do links.collect – banjara
@shekhar 나는 그것을 시도했다, 당신은 두 번째 이미지에서 볼 수있다. 여전히 작동하지 않습니다. –
전체 추적을 게시하십시오. 당신이 수집하기 전에 아무것도 실제로 계산되지 않습니다, 그래서 오류가 위의 가능성이 높습니다,하지만 결국에만 나타납니다. – marmouset