Download the script:
# Fetch the PySpark script. The URL is given bare: wrapping it in <...> would
# be parsed by the shell as input/output redirections, not as an argument.
wget http://media.sundog-soft.com/hadoop/CassandraSpark.py
# Exported so that child processes see it.
# NOTE(review): presumably selects Spark 2.x for spark-submit on platforms
# that honor this variable — confirm for your distribution.
export SPARK_MAJOR_VERSION=2
from pyspark.sql import functions, Row, SparkSession
def parseInput(line):
    """Convert one pipe-delimited text line into a Row.

    The line is split on '|' and the first five fields are mapped, in order,
    to user_id, age, gender, occupation and zip. user_id and age are
    converted to int; the remaining fields are kept as strings.

    Args:
        line: A string of the form 'user_id|age|gender|occupation|zip'.

    Returns:
        A Row with the keyword fields user_id, age, gender, occupation, zip.

    Raises:
        ValueError: If user_id or age is not a valid integer.
        IndexError: If the line has fewer than five '|'-separated fields.
    """
    fields = line.split('|')
    # Parentheses provide the implicit line continuation, so no trailing
    # backslashes are needed.
    return Row(
        user_id=int(fields[0]),
        age=int(fields[1]),
        gender=fields[2],
        occupation=fields[3],
        zip=fields[4],
    )
if __name__ == '__main__':
    # Create (or reuse) a Spark session named 'CassandraIntegration' whose
    # Cassandra connector is pointed at the local host.
    spark = (
        SparkSession.builder.appName('CassandraIntegration')
        .config('spark.cassandra.connection.host', '127.0.0.1')  # Cassandra address
        .getOrCreate()
    )

    try:
        # Three slashes: an empty authority means "use the default NameNode".
        # With only two, 'user' would be taken as the NameNode host name.
        lines = spark.sparkContext.textFile('hdfs:///user/maria_dev/ml-100k/u.user')

        # Turn each raw text line into a Row, then into a DataFrame.
        users = lines.map(parseInput)
        usersDataset = spark.createDataFrame(users)

        # Append the rows to the movielens.users table in Cassandra.
        (
            usersDataset.write
            .format('org.apache.spark.sql.cassandra')
            .mode('append')
            .options(table='users', keyspace='movielens')
            .save()
        )

        # Read the same table back from Cassandra into a new DataFrame.
        readUsers = (
            spark.read
            .format('org.apache.spark.sql.cassandra')
            .options(table='users', keyspace='movielens')
            .load()
        )

        # Register the DataFrame as a temp view so it can be queried with SQL.
        readUsers.createOrReplaceTempView('users')

        sqlDF = spark.sql('SELECT * FROM users WHERE age < 20')
        sqlDF.show()
    finally:
        # Always release the session, even if a step above failed.
        spark.stop()
# Submit the script, asking spark-submit to pull in the Cassandra connector
# package. A single trailing backslash continues the command onto the next
# line; a doubled one would pass a literal '\' and end the command early.
spark-submit --packages datastax:spark-cassandra-connector:2.0.0-M2-s_2.11 \
  CassandraSpark.py
# Open the CQL shell, requesting CQL version 3.4.0.
cqlsh --cqlversion="3.4.0"
-- Switch to the movielens keyspace, the one the Spark job writes to.
USE movielens;
-- Show up to 10 rows of the users table.
SELECT * FROM users LIMIT 10;