Problem Statement: -
Report the device that is first logged in for each player.

Dataframe API Solution: -
# Define the schema for the "Activity"
activity_schema = StructType([ StructField("player_id", IntegerType(), True), StructField("device_id", IntegerType(), True), StructField("event_date", StringType(), True), StructField("games_played", IntegerType(), True) ])
# Define data for the "Activity"
activity_data = [ (1, 2, '2016-03-01', 5), (1, 2, '2016-05-02', 6), (2, 3, '2017-06-25', 1), (3, 1, '2016-03-02', 0), (3, 4, '2018-07-03', 5) ]
df = spark.createDataFrame(activity_data,activity_schema)
df = df.withColumn("event_date",to_date("event_date","yyyy-MM-dd"))
df.show()
df.printSchema()
+---------+---------+----------+------------+
|player_id|device_id|event_date|games_played|
+---------+---------+----------+------------+
| 1| 2|2016-03-01| 5|
| 1| 2|2016-05-02| 6|
| 2| 3|2017-06-25| 1|
| 3| 1|2016-03-02| 0|
| 3| 4|2018-07-03| 5|
+---------+---------+----------+------------+
root
|-- player_id: integer (nullable = true)
|-- device_id: integer (nullable = true)
|-- event_date: date (nullable = true)
|-- games_played: integer (nullable = true)
window_spec = Window.partitionBy("player_id").orderBy("event_date")
result_df = df.withColumn("Device_id",first_value("Device_id").over(window_spec)).select("player_id","device_id").distinct()
result_df.show()
+---------+---------+
|player_id|device_id|
+---------+---------+
| 1| 2|
| 2| 3|
| 3| 1|
+---------+---------+