spark SQL 条件过滤
数据准备
val arrayStructureData = Seq(
Row(Row("James", "", "Smith"), List("Java", "Scala", "C++"), "OH", "M"),
Row(Row("Anna", "Rose", ""), List("Spark", "Java", "C++"), "NY", "F"),
Row(Row("Julia", "", "Williams"), List("CSharp", "VB"), "OH", "F"),
Row(Row("Maria", "Anne", "Jones"), List("CSharp", "VB"), "NY", "M"),
Row(Row("Jen", "Mary", "Brown"), List("CSharp", "VB"), "NY", "M"),
Row(Row("Mike", "Mary", "Williams"), List("Python", "VB"), "OH", "M")
)
val arrayStructureSchema = new StructType()
.add("name", new StructType()
.add("firstname", StringType)
.add("middlename", StringType)
.add("lastname", StringType))
.add("languages", ArrayType(StringType))
.add("state", StringType)
.add("gender", StringType)
val df = spark.createDataFrame(
spark.sparkContext.parallelize(arrayStructureData), arrayStructureSchema)
df.printSchema()
df.show()
root
|-- name: struct (nullable = true)
| |-- firstname: string (nullable = true)
| |-- middlename: string (nullable = true)
| |-- lastname: string (nullable = true)
|-- languages: array (nullable = true)
| |-- element: string (containsNull = true)
|-- state: string (nullable = true)
|-- gender: string (nullable = true)
+--------------------+------------------+-----+------+
| name| languages|state|gender|
+--------------------+------------------+-----+------+
| [James, , Smith]|[Java, Scala, C++]| OH| M|
| [Anna, Rose, ]|[Spark, Java, C++]| NY| F|
| [Julia, , Williams]| [CSharp, VB]| OH| F|
|[Maria, Anne, Jones]| [CSharp, VB]| NY| M|
| [Jen, Mary, Brown]| [CSharp, VB]| NY| M|
|[Mike, Mary, Will...| [Python, VB]| OH| M|
+--------------------+------------------+-----+------+
单个条件过滤
df.filter(df("state") === "OH").show(false)
+----------------------+------------------+-----+------+
|name |languages |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith] |[Java, Scala, C++]|OH |M |
|[Julia, , Williams] |[CSharp, VB] |OH |F |
|[Mike, Mary, Williams]|[Python, VB] |OH |M |
+----------------------+------------------+-----+------+
以下写法均能得到同样的结果:
df.filter('state === "OH").show(false)
df.filter($state === "OH").show(false)
df.filter(col("state") === "OH").show(false)
df.where(df("state") === "OH").show(false)
df.where('state === "OH").show(false)
df.where($state === "OH").show(false)
df.where(col("state") === "OH").show(false)
使用 SQL 表达式过滤
df.filter("gender == 'M'").show(false)
df.where("gender == 'M'").show(false)
+----------------------+------------------+-----+------+
|name |languages |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith] |[Java, Scala, C++]|OH |M |
|[Maria, Anne, Jones] |[CSharp, VB] |NY |M |
|[Jen, Mary, Brown] |[CSharp, VB] |NY |M |
|[Mike, Mary, Williams]|[Python, VB] |OH |M |
+----------------------+------------------+-----+------+
使用多个条件进行过滤
df.filter(df("state") === "OH" && df("gender") === "M").show(false)
+----------------------+------------------+-----+------+
|name |languages |state|gender|
+----------------------+------------------+-----+------+
|[James, , Smith] |[Java, Scala, C++]|OH |M |
|[Mike, Mary, Williams]|[Python, VB] |OH |M |
+----------------------+------------------+-----+------+
过滤数组列
import org.apache.spark.sql.functions.array_contains
df.filter(array_contains(df("languages"), "Java")).show(false)
+----------------+------------------+-----+------+
|name |languages |state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|OH |M |
|[Anna, Rose, ] |[Spark, Java, C++]|NY |F |
+----------------+------------------+-----+------+
根据列子集进行过滤
df.filter(df("name.lastname") === "Williams").show(false)
+----------------------+------------+-----+------+
|name |languages |state|gender|
+----------------------+------------+-----+------+
|[Julia, , Williams] |[CSharp, VB]|OH |F |
|[Mike, Mary, Williams]|[Python, VB]|OH |M |
+----------------------+------------+-----+------+