Problem Statement: -

Report the name and bonus amount of each employee with a bonus less than 1000.

Pasted image 20241225141158.png

Dataframe API Solution: -

employee_schema = StructType([ StructField("empId", IntegerType(), True), StructField("name", StringType(), True), StructField("supervisor", IntegerType(), True), StructField("salary", IntegerType(), True) ])
# Define data for the "Employee" 
employee_data = [ (3, 'Brad', None, 4000), (1, 'John', 3, 1000), (2, 'Dan', 3, 2000), (4, 'Thomas', 3, 4000) ]

bonus_schema = StructType([ StructField("empId", IntegerType(), True), StructField("bonus", IntegerType(), True) ])
# Define data for the "Bonus" 
bonus_data = [ (2, 500), (4, 2000) ]

emp_df = spark.createDataFrame(employee_data,employee_schema)
bns_df = spark.createDataFrame(bonus_data,bonus_schema)

emp_df.show()
bns_df.show()
+-----+------+----------+------+
|empId|  name|supervisor|salary|
+-----+------+----------+------+
|    3|  Brad|      NULL|  4000|
|    1|  John|         3|  1000|
|    2|   Dan|         3|  2000|
|    4|Thomas|         3|  4000|
+-----+------+----------+------+

+-----+-----+
|empId|bonus|
+-----+-----+
|    2|  500|
|    4| 2000|
+-----+-----+

join()

joined_df = bns_df.join(emp_df,bns_fltd_df.empId == emp_df.empId,"right")

result_df = joined_df.select("name","bonus").filter((col("bonus") <= 1000) | (col("bonus").isNull()))
result_df.show()
+----+-----+
|name|bonus|
+----+-----+
|Brad| NULL|
|John| NULL|
| Dan|  500|
+----+-----+

Spark SQL Solution: -

emp_df.createOrReplaceTempView("emp")
bns_df.createOrReplaceTempView("bns")

result_df = spark.sql("""
SELECT 
    e.name,
    b.bonus
FROM emp e LEFT JOIN bns b ON e.empId = b.empId
WHERE Bonus < 1000 OR Bonus IS NULL
""")

result_df.show()