Problem Statement: -

Find min, max salary and format date like '01 Dec 2024' and like '2025/12/01'.

from pyspark.sql.types import *
from pyspark.sql.functions import *

data = [
[1, "Vikas", "Ahlawat", 600000.0, "2013-02-15 11:16:28.290", "IT", "Male"],
[2, "nikita", "Jain", 530000.0, "2014-01-09 17:31:07.793", "HR", "Female"],
[3, "Ashish", "Kumar", 1000000.0, "2014-01-09 10:05:07.793", "IT", "Male"],
[4, "Nikhil", "Sharma", 480000.0, "2014-01-09 09:00:07.793", "HR", "Male"],
[5, "anish", "kadian", 500000.0, "2014-01-09 09:31:07.793", "Payroll", "Male"],
]
# Create a schema for the DataFrame
schema = StructType([
StructField("EmployeeID", IntegerType(), True),
StructField("First_Name", StringType(), True),
StructField("Last_Name", StringType(), True),
StructField("Salary", FloatType(), True),
StructField("Joining_Date", StringType(), True),
StructField("Department", StringType(), True),
StructField("Gender", StringType(), True)
])

df = spark.createDataFrame(data,schema)
df.show(truncate=False)
df.printSchema()
+----------+----------+---------+---------+-----------------------+----------+------+
|EmployeeID|First_Name|Last_Name|Salary   |Joining_Date           |Department|Gender|
+----------+----------+---------+---------+-----------------------+----------+------+
|1         |Vikas     |Ahlawat  |600000.0 |2013-02-15 11:16:28.290|IT        |Male  |
|2         |nikita    |Jain     |530000.0 |2014-01-09 17:31:07.793|HR        |Female|
|3         |Ashish    |Kumar    |1000000.0|2014-01-09 10:05:07.793|IT        |Male  |
|4         |Nikhil    |Sharma   |480000.0 |2014-01-09 09:00:07.793|HR        |Male  |
|5         |anish     |kadian   |500000.0 |2014-01-09 09:31:07.793|Payroll   |Male  |
+----------+----------+---------+---------+-----------------------+----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Salary: float (nullable = true)
 |-- Joining_Date: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Gender: string (nullable = true)

to_timestamp(): - To remove the time, it has to be first converted to timstamp and then to date.

df_fixed = df.withColumn("Salary",col("Salary").cast("int")) \
.withColumn("Joining_Date",to_timestamp(col("Joining_Date"),"yyyy-MM-dd HH:mm:ss.SSS"))

df_fixed.show()
+----------+----------+---------+-------+--------------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|        Joining_Date|Department|Gender|
+----------+----------+---------+-------+--------------------+----------+------+
|         1|     Vikas|  Ahlawat| 600000|2013-02-15 11:16:...|        IT|  Male|
|         2|    nikita|     Jain| 530000|2014-01-09 17:31:...|        HR|Female|
|         3|    Ashish|    Kumar|1000000|2014-01-09 10:05:...|        IT|  Male|
|         4|    Nikhil|   Sharma| 480000|2014-01-09 09:00:...|        HR|  Male|
|         5|     anish|   kadian| 500000|2014-01-09 09:31:...|   Payroll|  Male|
+----------+----------+---------+-------+--------------------+----------+------+
df_fixed_final = df_fixed.withColumn("Joining_Date",to_date("Joining_Date","yyyy-MM-dd"))

df_fixed_final.show()
df_fixed_final.printSchema()
+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
|         1|     Vikas|  Ahlawat| 600000|  2013-02-15|        IT|  Male|
|         2|    nikita|     Jain| 530000|  2014-01-09|        HR|Female|
|         3|    Ashish|    Kumar|1000000|  2014-01-09|        IT|  Male|
|         4|    Nikhil|   Sharma| 480000|  2014-01-09|        HR|  Male|
|         5|     anish|   kadian| 500000|  2014-01-09|   Payroll|  Male|
+----------+----------+---------+-------+------------+----------+------+

root
 |-- EmployeeID: integer (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Department: string (nullable = true)
 |-- Gender: string (nullable = true)
result = df_fixed_final.agg(
    min("salary").alias("Min_Sal"),
    max("salary").alias("Max_Sal")
    ).collect()

min_sal = result[0]["Min_Sal"]
max_sal = result[0]["Max_Sal"]

print(f"The minimum Salary = {min_sal} \nThe maximum Salary = {max_sal}")
The minimum Salary = 480000 
The maximum Salary = 1000000
min_sal = df_fixed_final.agg(min("salary").alias("Min_Sal"))
min_sal.show()
+-------+
|Min_Sal|
+-------+
| 480000|
+-------+

date_format()

df_fixed_date = df_fixed_final.withColumn("Joining_Date",date_format("Joining_Date","dd MMM yyyy"))
df_fixed.show()

df_fixed_date2 = df_fixed_final.withColumn("Joining_Date",date_format("Joining_Date","yyyy/MM/dd"))
df_fixed_date2.show()
+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
|         1|     Vikas|  Ahlawat| 600000| 15 Feb 2013|        IT|  Male|
|         2|    nikita|     Jain| 530000| 09 Jan 2014|        HR|Female|
|         3|    Ashish|    Kumar|1000000| 09 Jan 2014|        IT|  Male|
|         4|    Nikhil|   Sharma| 480000| 09 Jan 2014|        HR|  Male|
|         5|     anish|   kadian| 500000| 09 Jan 2014|   Payroll|  Male|
+----------+----------+---------+-------+------------+----------+------+

+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
|         1|     Vikas|  Ahlawat| 600000|  2013/02/15|        IT|  Male|
|         2|    nikita|     Jain| 530000|  2014/01/09|        HR|Female|
|         3|    Ashish|    Kumar|1000000|  2014/01/09|        IT|  Male|
|         4|    Nikhil|   Sharma| 480000|  2014/01/09|        HR|  Male|
|         5|     anish|   kadian| 500000|  2014/01/09|   Payroll|  Male|
+----------+----------+---------+-------+------------+----------+------+