如何解决在Spark中使用nltk时出现的错误
我在使用Spark进行文本挖掘时遇到了问题,请帮帮我。为了便于定位问题,我附上了完整的错误信息。我找不到任何可用于调试此错误的资料,也不知道为什么在执行words.collect()时Python没有返回结果。我下载了适用于Apache Hadoop 2.6的Spark并解压,但在运行下面这段代码时抛出了错误。
这是我的代码:
from pyspark import SparkConf
from pyspark import SparkContext

import nltk

# Build the application configuration and actually hand it to the context.
# The original created `conf` but called getOrCreate() without it, so the
# app name was never applied. Note that getOrCreate() only uses `conf` when
# no SparkContext exists yet; an already-running context is returned as-is.
conf = SparkConf()
conf.setAppName('spark-NLTK')
sc = SparkContext.getOrCreate(conf=conf)

# Each element of the resulting RDD is one line of the input file.
data = sc.textFile('c:/Users/Ramin/Desktop/Nixon.txt')


# word tokenization
def word_tokenize(x):
    """Lower-case one line of text and split it into word tokens.

    Args:
        x: A single line of text (one element of the ``data`` RDD).

    Returns:
        The list of tokens produced by ``nltk.word_tokenize`` for the
        lower-cased line.
    """
    # The original computed the lower-cased string but then tokenized the
    # untouched input, silently discarding the normalisation.
    return nltk.word_tokenize(x.lower())


# flatMap flattens the per-line token lists into a single RDD of tokens.
words = data.flatMap(word_tokenize)
words.collect()
我收到此错误:
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times,most recent failure: Lost task 1.0 in stage 0.0 (TID 1,localhost,executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py",line 364,in main
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py",line 69,in read_command
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py",line 173,in _read_with_length
return self.loads(obj)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py",line 587,in loads
return pickle.loads(obj,encoding=encoding)
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\cloudpickle.py",line 875,in subimport
__import__(name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\__init__.py",line 143,in <module>
from nltk.chunk import *
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\__init__.py",line 157,in <module>
from nltk.chunk.api import ChunkParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\chunk\api.py",line 13,in <module>
from nltk.parse import ParserI
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\__init__.py",line 100,in <module>
from nltk.parse.transitionparser import TransitionParser
File "C:\Users\Ramin\Anaconda3\lib\site-packages\nltk\parse\transitionparser.py",line 22,in <module>
from sklearn.datasets import load_svmlight_file
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\__init__.py",in <module>
from .twenty_newsgroups import fetch_20newsgroups
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\datasets\twenty_newsgroups.py",line 44,in <module>
from ..feature_extraction.text import CountVectorizer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\__init__.py",line 10,in <module>
from . import text
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py",line 28,in <module>
from ..preprocessing import normalize
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py",line 6,in <module>
from ._function_transformer import FunctionTransformer
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\preprocessing\_function_transformer.py",line 5,in <module>
from ..utils.testing import assert_allclose_dense_sparse
File "C:\Users\Ramin\Anaconda3\lib\site-packages\sklearn\utils\testing.py",line 718,in <module>
import pytest
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pytest.py",in <module>
from _pytest.assertion import register_assert_rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\__init__.py",line 7,in <module>
from _pytest.assertion import rewrite
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\rewrite.py",line 26,in <module>
from _pytest.assertion import util
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\assertion\util.py",line 8,in <module>
import _pytest._code
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\__init__.py",line 2,in <module>
from .code import Code # noqa
File "C:\Users\Ramin\Anaconda3\lib\site-packages\_pytest\_code\code.py",line 23,in <module>
import pluggy
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\__init__.py",line 16,in <module>
from .manager import PluginManager,PluginValidationError
File "C:\Users\Ramin\Anaconda3\lib\site-packages\pluggy\manager.py",line 11,in <module>
import importlib_metadata
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 547,in <module>
__version__ = version(__name__)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 509,in version
return distribution(distribution_name).version
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 482,in distribution
return Distribution.from_name(distribution_name)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 183,in from_name
dist = next(dists,None)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 425,in <genexpr>
for path in map(cls._switch_path,paths)
File "C:\Users\Ramin\Anaconda3\lib\site-packages\importlib_metadata\__init__.py",line 449,in _search_path
if not root.is_dir():
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py",line 1358,in is_dir
return S_ISDIR(self.stat().st_mode)
File "C:\Users\Ramin\Anaconda3\lib\pathlib.py",line 1168,in stat
return self._accessor.stat(self)
OSError: [WinError 123] The filename,directory name,or volume label syntax is incorrect: 'C:\\C:\\Bigdata\\SPARK\\spark-2.4.6-bin-hadoop2.7\\jars\\spark-core_2.11-2.4.6.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Bigdata\SPARK\spark-2.4.6-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py",or volume label syntax is incorrect: 'C:\\C:\\Bigdata\\SPARK\\spark-2.4.6-bin-hadoop2.7\\jars\\spark-core_2.11-2.4.6.jar'
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$15.apply(RDD.scala:990)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:123)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
解决方法
Spark的某些配置不正确:从错误信息可以看出,它在Spark库的路径前重复添加了两次C盘盘符,导致路径无效:
C:\\C:\\Bigdata\\SPARK
因此,在运行自己的代码之前,我会先尝试运行Spark自带的示例程序(例如SparkPi),以确认Spark的安装和配置本身没有问题。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。