Problem Statement: -
Find all customers who never order anything.

Dataframe API Solution: -
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *
# Define the schema for the "Customers"
customers_schema = StructType([
StructField("id", IntegerType(), True),
StructField("name", StringType(), True)
])
# Define data for the "Customers"
customers_data = [
(1, 'Joe'),
(2, 'Henry'),
(3, 'Sam'),
(4, 'Max')
]
# Define the schema for the "Orders"
orders_schema = StructType([
StructField("id", IntegerType(), True),
StructField("customerId", IntegerType(), True)
])
# Define data for the "Orders"
orders_data = [
(1, 3),
(2, 1)
]
cust_df = spark.createDataFrame(customers_data,customers_schema)
cust_df.show()
ords_df = spark.createDataFrame(orders_data,orders_schema)
ords_df.show()
+---+-----+
| id| name|
+---+-----+
| 1| Joe|
| 2|Henry|
| 3| Sam|
| 4| Max|
+---+-----+
+---+----------+
| id|customerId|
+---+----------+
| 1| 3|
| 2| 1|
+---+----------+
joined_df = cust_df.join(ords_df,cust_df.id == ords_df.id,"left")
joined_df.show()
result_df = joined_df.filter(ords_df["id"].isNull()) \
..select(cust_df["id"],"name")
result_df.show()
+---+-----+----+----------+
| id| name| id|customerId|
+---+-----+----+----------+
| 1| Joe| 1| 3|
| 2|Henry| 2| 1|
| 3| Sam|NULL| NULL|
| 4| Max|NULL| NULL|
+---+-----+----+----------+
+---+----+
| id|name|
+---+----+
| 3| Sam|
| 4| Max|
+---+----+
Direct solution,
leftanti join: -
result_df = cust_df.join(ords_df,cust_df.id == ords_df.id,"leftanti")
result_df.show()
+---+----+
| id|name|
+---+----+
| 3| Sam|
| 4| Max|
+---+----+
Spark SQL Solution: -
ords_df.createOrReplaceTempView("orders")
cust_df.createOrReplaceTempView("customers")
result_df = spark.sql("""
SELECT
c.id,
c.name
FROM customers c LEFT JOIN orders o ON c.id = o.id
WHERE o.id IS NULL
""")
result_df.show()