Problem Statement: -

Find the ids of all dates whose temperature is higher than the temperature on the previous date (yesterday).
Return the result table in any order.


Dataframe API Solution: -

to_date()

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Define the schema for the "Weather" table
weather_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("recordDate", StringType(), True),
    StructField("temperature", IntegerType(), True)
])

# Define data for the "Weather" table
weather_data = [
    (1, '2015-01-01', 10),
    (2, '2015-01-02', 25),
    (3, '2015-01-03', 20),
    (4, '2015-01-04', 30)
]

wthr_df = spark.createDataFrame(weather_data, weather_schema)
wthr_df = wthr_df.withColumn("recordDate", to_date("recordDate", "yyyy-MM-dd"))
wthr_df.show()
+---+----------+-----------+
| id|recordDate|temperature|
+---+----------+-----------+
|  1|2015-01-01|         10|
|  2|2015-01-02|         25|
|  3|2015-01-03|         20|
|  4|2015-01-04|         30|
+---+----------+-----------+
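
to_date() parses the recordDate strings into a proper DateType column, so the later ordering and any date arithmetic behave as real dates rather than strings. A quick way to confirm the conversion is printSchema(), which should report recordDate as date:

# Optional sanity check: recordDate should now be of date type
wthr_df.printSchema()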

Window

# Window ordered by recordDate; no partitionBy, so all rows go to a single partition (fine for this small dataset)
window_spec = Window.orderBy(col("recordDate").asc())

# Pull the previous day's temperature onto each row
lag_wthr_df = wthr_df.withColumn("Prev_Day_temp", lag(col("temperature")).over(window_spec))
lag_wthr_df.show()
+---+----------+-----------+-------------+
| id|recordDate|temperature|Prev_Day_temp|
+---+----------+-----------+-------------+
|  1|2015-01-01|         10|         NULL|
|  2|2015-01-02|         25|           10|
|  3|2015-01-03|         20|           25|
|  4|2015-01-04|         30|           20|
+---+----------+-----------+-------------+
# The first date has a NULL Prev_Day_temp, so the comparison drops it automatically
result_df = lag_wthr_df.filter(col("temperature") > col("Prev_Day_temp")).select("id")
result_df.show()
+---+
| id|
+---+
|  2|
|  4|
+---+
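
Note that lag() compares each row with the previous row in date order, which matches "yesterday" here only because the sample dates are consecutive. If the data could contain gaps between dates, one way to make the previous-calendar-day condition explicit is a self-join on datediff() = 1; a minimal sketch (result_alt_df is just an illustrative name):

# Sketch: join each date with the row exactly one day earlier,
# then keep the ids where today's temperature is higher
result_alt_df = (
    wthr_df.alias("t")
    .join(wthr_df.alias("y"), datediff(col("t.recordDate"), col("y.recordDate")) == 1)
    .filter(col("t.temperature") > col("y.temperature"))
    .select(col("t.id"))
)
result_alt_df.show()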

Spark SQL Solution: -

wthr_df.createOrReplaceTempView("tmpr")

result_df = spark.sql("""
WITH CTE AS (
    SELECT
        id,
        temperature,
        LAG(temperature) OVER (ORDER BY recordDate) AS Prev_day_temp
    FROM tmpr
)

SELECT id
FROM CTE
WHERE temperature > Prev_day_temp
""")

result_df.show()
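
The same gap-safe self-join idea can be expressed in Spark SQL against the tmpr view; a minimal sketch using DATEDIFF (alt_sql_df is just an illustrative name):

alt_sql_df = spark.sql("""
SELECT t.id
FROM tmpr t
JOIN tmpr y
  ON DATEDIFF(t.recordDate, y.recordDate) = 1
WHERE t.temperature > y.temperature
""")
alt_sql_df.show()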