Problem Statement: -
Given dataframe, get street and city in different columns.
data=[('John Doe','{"street": "123 Main St", "city": "Anytown"}'),('Jane Smith','{"street": "456 Elm St", "city": "Othertown"}')]
df=spark.createDataFrame(data,schema="name string,address string")
df.show(truncate=False)
+----------+---------------------------------------------+
|name |address |
+----------+---------------------------------------------+
|John Doe |{"street": "123 Main St", "city": "Anytown"} |
|Jane Smith|{"street": "456 Elm St", "city": "Othertown"}|
+----------+---------------------------------------------+
df_new = df \
.withColumn("parsed_json",from_json(col("address"),'street string,city string')).drop("address")
df_new.show(truncate=False)
df_new.printSchema()
+----------+-----------------------+
|name |parsed_json |
+----------+-----------------------+
|John Doe |{123 Main St, Anytown} |
|Jane Smith|{456 Elm St, Othertown}|
+----------+-----------------------+
root
|-- name: string (nullable = true)
|-- parsed_json: struct (nullable = true)
| |-- street: string (nullable = true)
| |-- city: string (nullable = true)
df_result = df_new.select(col("name"),col("parsed_json").street.alias("Street"),col("parsed_json").city.alias("City"))
df_result.show()
+----------+-----------+---------+
| name| Street| City|
+----------+-----------+---------+
| John Doe|123 Main St| Anytown|
|Jane Smith| 456 Elm St|Othertown|
+----------+-----------+---------+