Problem Statement: -
Find min, max salary and format date like '01 Dec 2024' and like '2025/12/01'.
from pyspark.sql.types import *
from pyspark.sql.functions import *
data = [
[1, "Vikas", "Ahlawat", 600000.0, "2013-02-15 11:16:28.290", "IT", "Male"],
[2, "nikita", "Jain", 530000.0, "2014-01-09 17:31:07.793", "HR", "Female"],
[3, "Ashish", "Kumar", 1000000.0, "2014-01-09 10:05:07.793", "IT", "Male"],
[4, "Nikhil", "Sharma", 480000.0, "2014-01-09 09:00:07.793", "HR", "Male"],
[5, "anish", "kadian", 500000.0, "2014-01-09 09:31:07.793", "Payroll", "Male"],
]
# Create a schema for the DataFrame
schema = StructType([
StructField("EmployeeID", IntegerType(), True),
StructField("First_Name", StringType(), True),
StructField("Last_Name", StringType(), True),
StructField("Salary", FloatType(), True),
StructField("Joining_Date", StringType(), True),
StructField("Department", StringType(), True),
StructField("Gender", StringType(), True)
])
df = spark.createDataFrame(data,schema)
df.show(truncate=False)
df.printSchema()
+----------+----------+---------+---------+-----------------------+----------+------+
|EmployeeID|First_Name|Last_Name|Salary |Joining_Date |Department|Gender|
+----------+----------+---------+---------+-----------------------+----------+------+
|1 |Vikas |Ahlawat |600000.0 |2013-02-15 11:16:28.290|IT |Male |
|2 |nikita |Jain |530000.0 |2014-01-09 17:31:07.793|HR |Female|
|3 |Ashish |Kumar |1000000.0|2014-01-09 10:05:07.793|IT |Male |
|4 |Nikhil |Sharma |480000.0 |2014-01-09 09:00:07.793|HR |Male |
|5 |anish |kadian |500000.0 |2014-01-09 09:31:07.793|Payroll |Male |
+----------+----------+---------+---------+-----------------------+----------+------+
root
|-- EmployeeID: integer (nullable = true)
|-- First_Name: string (nullable = true)
|-- Last_Name: string (nullable = true)
|-- Salary: float (nullable = true)
|-- Joining_Date: string (nullable = true)
|-- Department: string (nullable = true)
|-- Gender: string (nullable = true)
to_timestamp(): - To remove the time, it has to be first converted to timstamp and then to date.
df_fixed = df.withColumn("Salary",col("Salary").cast("int")) \
.withColumn("Joining_Date",to_timestamp(col("Joining_Date"),"yyyy-MM-dd HH:mm:ss.SSS"))
df_fixed.show()
+----------+----------+---------+-------+--------------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary| Joining_Date|Department|Gender|
+----------+----------+---------+-------+--------------------+----------+------+
| 1| Vikas| Ahlawat| 600000|2013-02-15 11:16:...| IT| Male|
| 2| nikita| Jain| 530000|2014-01-09 17:31:...| HR|Female|
| 3| Ashish| Kumar|1000000|2014-01-09 10:05:...| IT| Male|
| 4| Nikhil| Sharma| 480000|2014-01-09 09:00:...| HR| Male|
| 5| anish| kadian| 500000|2014-01-09 09:31:...| Payroll| Male|
+----------+----------+---------+-------+--------------------+----------+------+
df_fixed_final = df_fixed.withColumn("Joining_Date",to_date("Joining_Date","yyyy-MM-dd"))
df_fixed_final.show()
df_fixed_final.printSchema()
+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
| 1| Vikas| Ahlawat| 600000| 2013-02-15| IT| Male|
| 2| nikita| Jain| 530000| 2014-01-09| HR|Female|
| 3| Ashish| Kumar|1000000| 2014-01-09| IT| Male|
| 4| Nikhil| Sharma| 480000| 2014-01-09| HR| Male|
| 5| anish| kadian| 500000| 2014-01-09| Payroll| Male|
+----------+----------+---------+-------+------------+----------+------+
root
|-- EmployeeID: integer (nullable = true)
|-- First_Name: string (nullable = true)
|-- Last_Name: string (nullable = true)
|-- Salary: integer (nullable = true)
|-- Joining_Date: date (nullable = true)
|-- Department: string (nullable = true)
|-- Gender: string (nullable = true)
result = df_fixed_final.agg(
min("salary").alias("Min_Sal"),
max("salary").alias("Max_Sal")
).collect()
min_sal = result[0]["Min_Sal"]
max_sal = result[0]["Max_Sal"]
print(f"The minimum Salary = {min_sal} \nThe maximum Salary = {max_sal}")
The minimum Salary = 480000
The maximum Salary = 1000000
min_sal = df_fixed_final.agg(min("salary").alias("Min_Sal"))
min_sal.show()
+-------+
|Min_Sal|
+-------+
| 480000|
+-------+
df_fixed_date = df_fixed_final.withColumn("Joining_Date",date_format("Joining_Date","dd MMM yyyy"))
df_fixed.show()
df_fixed_date2 = df_fixed_final.withColumn("Joining_Date",date_format("Joining_Date","yyyy/MM/dd"))
df_fixed_date2.show()
+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
| 1| Vikas| Ahlawat| 600000| 15 Feb 2013| IT| Male|
| 2| nikita| Jain| 530000| 09 Jan 2014| HR|Female|
| 3| Ashish| Kumar|1000000| 09 Jan 2014| IT| Male|
| 4| Nikhil| Sharma| 480000| 09 Jan 2014| HR| Male|
| 5| anish| kadian| 500000| 09 Jan 2014| Payroll| Male|
+----------+----------+---------+-------+------------+----------+------+
+----------+----------+---------+-------+------------+----------+------+
|EmployeeID|First_Name|Last_Name| Salary|Joining_Date|Department|Gender|
+----------+----------+---------+-------+------------+----------+------+
| 1| Vikas| Ahlawat| 600000| 2013/02/15| IT| Male|
| 2| nikita| Jain| 530000| 2014/01/09| HR|Female|
| 3| Ashish| Kumar|1000000| 2014/01/09| IT| Male|
| 4| Nikhil| Sharma| 480000| 2014/01/09| HR| Male|
| 5| anish| kadian| 500000| 2014/01/09| Payroll| Male|
+----------+----------+---------+-------+------------+----------+------+