Friday, October 23, 2015

Spark-SQL : How to query a CSV file.

$ /spark-1.4.1/bin/spark-shell --packages com.databricks:spark-csv_2.10:1.1.0
scala> sqlContext
res0: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@12c57a43

scala> val df = sqlContext.load("com.databricks.spark.csv", Map("path" -> "file:///root/emp.csv","header"->"true"))
warning: there were 1 deprecation warning(s); re-run with -deprecation for details
df: org.apache.spark.sql.DataFrame = [emp_id: string, emp_name: string, country: string, salary: string]
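The deprecation warning shows up because sqlContext.load was superseded by the DataFrameReader API in Spark 1.4. The same load (same path and header option as above) can be written with the newer API, roughly like this:

scala> val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load("file:///root/emp.csv")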

scala>   df.printSchema()
root
 |-- emp_id: string (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- salary: string (nullable = true)

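Notice that every column, including salary, came back as a string: spark-csv reads all fields as strings unless schema inference is switched on. If the spark-csv build you are using supports the inferSchema option, a typed load would look something like this sketch:

scala> // inferSchema availability depends on the spark-csv version
scala> val typed = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("file:///root/emp.csv")
scala> typed.printSchema()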

scala> df.registerTempTable("emp")

scala> val names = sqlContext.sql("select emp_name from emp")
names: org.apache.spark.sql.DataFrame = [emp_name: string]
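Since salary was loaded as a string, any numeric condition needs an explicit cast in the SQL. A hypothetical example (the 50000 threshold is only for illustration):

scala> val highPaid = sqlContext.sql("select emp_name, cast(salary as int) as salary from emp where cast(salary as int) > 50000")
scala> highPaid.show()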

scala> names.foreach(println)
[Guy]
[Jonas]
[Hector]
[Byron]
[Owen]
[Zachery]
[Alden]
[Akeem]
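foreach(println) only prints to this console because the shell is running in local mode, where driver and executors share one JVM; names.show() (or collect() first) is a safer way to inspect a small result. The query result can also be written back out as CSV with the same package, here to a hypothetical output path:

scala> names.write.format("com.databricks.spark.csv").option("header", "true").save("file:///root/emp_names_out")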
