Problem Statement: -

Report the device that is first logged in for each player.

Pasted image 20241225140131.png

Dataframe API Solution: -

# Define the schema for the "Activity" 
activity_schema = StructType([ StructField("player_id", IntegerType(), True), StructField("device_id", IntegerType(), True), StructField("event_date", StringType(), True), StructField("games_played", IntegerType(), True) ])
# Define data for the "Activity" 
activity_data = [ (1, 2, '2016-03-01', 5), (1, 2, '2016-05-02', 6), (2, 3, '2017-06-25', 1), (3, 1, '2016-03-02', 0), (3, 4, '2018-07-03', 5) ]

df = spark.createDataFrame(activity_data,activity_schema)

df = df.withColumn("event_date",to_date("event_date","yyyy-MM-dd"))
df.show()
df.printSchema()
+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
|        1|        2|2016-03-01|           5|
|        1|        2|2016-05-02|           6|
|        2|        3|2017-06-25|           1|
|        3|        1|2016-03-02|           0|
|        3|        4|2018-07-03|           5|
+---------+---------+----------+------------+

root
 |-- player_id: integer (nullable = true)
 |-- device_id: integer (nullable = true)
 |-- event_date: date (nullable = true)
 |-- games_played: integer (nullable = true)
window_spec = Window.partitionBy("player_id").orderBy("event_date")
result_df = df.withColumn("Device_id",first_value("Device_id").over(window_spec)).select("player_id","device_id").distinct()

result_df.show()
+---------+---------+
|player_id|device_id|
+---------+---------+
|        1|        2|
|        2|        3|
|        3|        1|
+---------+---------+