Problem Statement: -

Find all customers who never order anything.

Pasted image 20241225124219.png

Dataframe API Solution: -

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Define the schema for the "Customers"
customers_schema = StructType([
StructField("id", IntegerType(), True),
StructField("name", StringType(), True)
])
# Define data for the "Customers"
customers_data = [
(1, 'Joe'),
(2, 'Henry'),
(3, 'Sam'),
(4, 'Max')
]
# Define the schema for the "Orders"
orders_schema = StructType([
StructField("id", IntegerType(), True),
StructField("customerId", IntegerType(), True)
])
# Define data for the "Orders"
orders_data = [
(1, 3),
(2, 1)
]

cust_df = spark.createDataFrame(customers_data,customers_schema)
cust_df.show()

ords_df = spark.createDataFrame(orders_data,orders_schema)
ords_df.show()
+---+-----+
| id| name|
+---+-----+
|  1|  Joe|
|  2|Henry|
|  3|  Sam|
|  4|  Max|
+---+-----+

+---+----------+
| id|customerId|
+---+----------+
|  1|         3|
|  2|         1|
+---+----------+

join()

joined_df = cust_df.join(ords_df,cust_df.id == ords_df.id,"left")
joined_df.show()

result_df = joined_df.filter(ords_df["id"].isNull()) \
..select(cust_df["id"],"name")

result_df.show()
+---+-----+----+----------+
| id| name|  id|customerId|
+---+-----+----+----------+
|  1|  Joe|   1|         3|
|  2|Henry|   2|         1|
|  3|  Sam|NULL|      NULL|
|  4|  Max|NULL|      NULL|
+---+-----+----+----------+

+---+----+
| id|name|
+---+----+
|  3| Sam|
|  4| Max|
+---+----+

Direct solution,
leftanti join: -

result_df = cust_df.join(ords_df,cust_df.id == ords_df.id,"leftanti")
result_df.show()
+---+----+
| id|name|
+---+----+
|  3| Sam|
|  4| Max|
+---+----+

Spark SQL Solution: -

ords_df.createOrReplaceTempView("orders")
cust_df.createOrReplaceTempView("customers")

result_df = spark.sql("""
SELECT 
    c.id,
    c.name
FROM customers c LEFT JOIN orders o ON c.id = o.id
WHERE o.id IS NULL
""")

result_df.show()