spark SQL 多元条件
数据准备
val data = List(("James", "", "Smith", "36636", "M", 60000),
("Michael", "Rose", "", "40288", "M", 70000),
("Robert", "", "Williams", "42114", "", 400000),
("Maria", "Anne", "Jones", "39192", "F", 500000),
("Jen", "Mary", "Brown", "", "F", 0))
val cols = Seq("first_name", "middle_name", "last_name", "dob", "gender", "salary")
val df = spark.createDataFrame(data).toDF(cols: _*)
df.show(false)
+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob |gender|salary|
+----------+-----------+---------+-----+------+------+
|James | |Smith |36636|M |60000 |
|Michael |Rose | |40288|M |70000 |
|Robert | |Williams |42114| |400000|
|Maria |Anne |Jones |39192|F |500000|
|Jen |Mary |Brown | |F |0 |
+----------+-----------+---------+-----+------+------+
when otherwise
val df2 = df.withColumn("new_gender",
when(col("gender") === "M", "Male").when(col("gender") === "F", "Female").otherwise("Unknown"))
df2.show(false)
+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James | |Smith |36636|M |60000 |Male |
|Michael |Rose | |40288|M |70000 |Male |
|Robert | |Williams |42114| |400000|Unknown |
|Maria |Anne |Jones |39192|F |500000|Female |
|Jen |Mary |Brown | |F |0 |Female |
+----------+-----------+---------+-----+------+------+----------+
case when
df.withColumn("new_gender",
expr("case when gender = 'M' then 'Male' " +
"when gender = 'F' then 'Female' " +
"else 'Unknown' end")).show(false)
+----------+-----------+---------+-----+------+------+----------+
|first_name|middle_name|last_name|dob |gender|salary|new_gender|
+----------+-----------+---------+-----+------+------+----------+
|James | |Smith |36636|M |60000 |Male |
|Michael |Rose | |40288|M |70000 |Male |
|Robert | |Williams |42114| |400000|Unknown |
|Maria |Anne |Jones |39192|F |500000|Female |
|Jen |Mary |Brown | |F |0 |Female |
+----------+-----------+---------+-----+------+------+----------+
组合使用 && 和 ||
val dataDF = Seq(
(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4"
)).toDF("id", "code", "amt")
dataDF.show(false)
dataDF.withColumn("new_column",
when(col("code") === "a" || col("code") === "d", "A")
.when(col("code") === "b" && col("amt") === "4", "B")
.otherwise("A1")).show()
+---+----+---+
|id |code|amt|
+---+----+---+
|66 |a |4 |
|67 |a |0 |
|70 |b |4 |
|71 |d |4 |
+---+----+---+
+---+----+---+----------+
| id|code|amt|new_column|
+---+----+---+----------+
| 66| a| 4| A|
| 67| a| 0| A|
| 70| b| 4| B|
| 71| d| 4| A|
+---+----+---+----------+