Given nested json file like: -
%fs head Files/tetx.json
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
Flatten it.
df = spark \
.read \
.format("json") \
.option("multiline",True) \
.load("Files/tetx.json")
df.show()
df.printSchema()
+--------------------+----+----+----+--------------------+-----+
| batters| id|name| ppu| topping| type|
+--------------------+----+----+----+--------------------+-----+
|{[{1001, Regular}...|0001|Cake|0.55|[{5001, None}, {5...|donut|
+--------------------+----+----+----+--------------------+-----+
root
|-- batters: struct (nullable = true)
| |-- batter: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- id: string (nullable = true)
| | | |-- type: string (nullable = true)
|-- id: string (nullable = true)
|-- name: string (nullable = true)
|-- ppu: double (nullable = true)
|-- topping: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: string (nullable = true)
| | |-- type: string (nullable = true)
|-- type: string (nullable = true)
df_final = df \
.withColumn("new_batters",explode("batters.batter"))\
.withColumn("new_topping",explode("topping")) \
.drop("topping","batters") \
.select("id","name","ppu","type", \
col("new_batters.id").alias("Btrs_id"), \
col("new_batters.type").alias("Btrs_type"), \
col("new_topping.id").alias("tpng_id"), \
col("new_topping.type").alias("tpng_type"))
df_final.show()
+----+----+----+-----+-------+---------+-------+--------------------+
| id|name| ppu| type|Btrs_id|Btrs_type|tpng_id| tpng_type|
+----+----+----+-----+-------+---------+-------+--------------------+
|0001|Cake|0.55|donut| 1001| Regular| 5001| None|
|0001|Cake|0.55|donut| 1001| Regular| 5002| Glazed|
|0001|Cake|0.55|donut| 1001| Regular| 5005| Sugar|
|0001|Cake|0.55|donut| 1001| Regular| 5007| Powdered Sugar|
|0001|Cake|0.55|donut| 1001| Regular| 5006|Chocolate with Sp...|
|0001|Cake|0.55|donut| 1001| Regular| 5003| Chocolate|
|0001|Cake|0.55|donut| 1001| Regular| 5004| Maple|
|0001|Cake|0.55|donut| 1002|Chocolate| 5001| None|
|0001|Cake|0.55|donut| 1002|Chocolate| 5002| Glazed|
|0001|Cake|0.55|donut| 1002|Chocolate| 5005| Sugar|
|0001|Cake|0.55|donut| 1002|Chocolate| 5007| Powdered Sugar|
|0001|Cake|0.55|donut| 1002|Chocolate| 5006|Chocolate with Sp...|
|0001|Cake|0.55|donut| 1002|Chocolate| 5003| Chocolate|
|0001|Cake|0.55|donut| 1002|Chocolate| 5004| Maple|
|0001|Cake|0.55|donut| 1003|Blueberry| 5001| None|
|0001|Cake|0.55|donut| 1003|Blueberry| 5002| Glazed|
|0001|Cake|0.55|donut| 1003|Blueberry| 5005| Sugar|
|0001|Cake|0.55|donut| 1003|Blueberry| 5007| Powdered Sugar|
|0001|Cake|0.55|donut| 1003|Blueberry| 5006|Chocolate with Sp...|
|0001|Cake|0.55|donut| 1003|Blueberry| 5003| Chocolate|
+----+----+----+-----+-------+---------+-------+--------------------+