Problem Statement: -

Return the IDs of the Facebook pages that have zero likes.
The output should be sorted in ascending order based on the page IDs.

Pasted image 20241225163122.png

Dataframe API Solution: -

pages_schema = StructType([ StructField("page_id", IntegerType(), True), StructField("page_name", StringType(), True) ]) 
# Define the schema for the page_likes table 
page_likes_schema = StructType([ StructField("user_id", IntegerType(), True), StructField("page_id", IntegerType(), True), StructField("liked_date", StringType(), True) ]) 
# Create an RDD with the data for pages 
pages_data = [ (20001, 'SQL Solutions'), (20045, 'Brain Exercises'), (20701, 'Tips for Data Analysts') ] # Create an RDD with the data for page_likes table
page_likes_data = [ (111, 20001, '2022-04-08 00:00:00'), (121, 20045, '2022-03-12 00:00:00'), (156, 20001, '2022-07-25 00:00:00') ]

page_df = spark.createDataFrame(pages_data,pages_schema)
likes_df = spark.createDataFrame(page_likes_data,page_likes_schema)

page_df.show()
likes_df.show()
+-------+--------------------+
|page_id|           page_name|
+-------+--------------------+
|  20001|       SQL Solutions|
|  20045|     Brain Exercises|
|  20701|Tips for Data Ana...|
+-------+--------------------+

+-------+-------+-------------------+
|user_id|page_id|         liked_date|
+-------+-------+-------------------+
|    111|  20001|2022-04-08 00:00:00|
|    121|  20045|2022-03-12 00:00:00|
|    156|  20001|2022-07-25 00:00:00|
+-------+-------+-------------------+

join()

result = page_df.join(likes_df,page_df.page_id == likes_df.page_id,"leftanti").select("page_id").orderBy("page_id")

result.show()
+-------+
|page_id|
+-------+
|  20701|
+-------+

Spark SQL Solution: -

page_df.createOrReplaceTempView("pgs")
likes_df.createOrReplaceTempView("lks")

result = spark.sql("""
SELECT
    p.page_id
FROM pgs p LEFT JOIN lks l ON p.page_id = l.page_id
WHERE l.page_id IS NULL
""")
result.show()