Thursday, October 22, 2015

Spark SQL: how to query JSON files

scala> sqlContext
res0: org.apache.spark.sql.SQLContext = org.apache.spark.sql.hive.HiveContext@330305d3

scala> val df = sqlContext.load("org.apache.spark.sql.json", Map("path" -> "file:///employee.json"))
warning: there were 1 deprecation warning(s); re-run with -deprecation for details
df: org.apache.spark.sql.DataFrame = [birth_date: string, department_id: bigint, education_level: string, employee_id: bigint, end_date: string, first_name: string, full_name: string, gender: string, hire_date: string, last_name: string, management_role: string, marital_status: string, position_id: bigint, position_title: string, salary: double, store_id: bigint, supervisor_id: bigint]

scala>  df.printSchema()
root
 |-- birth_date: string (nullable = true)
 |-- department_id: long (nullable = true)
 |-- education_level: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- end_date: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hire_date: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- management_role: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- position_id: long (nullable = true)
 |-- position_title: string (nullable = true)
 |-- salary: double (nullable = true)
 |-- store_id: long (nullable = true)
 |-- supervisor_id: long (nullable = true)

scala> df.registerTempTable("employee")
scala> val names = sqlContext.sql("select first_name from employee limit 5")
names: org.apache.spark.sql.DataFrame = [first_name: string]
scala> names.foreach(println)

