Problem Statement: -

Given csv file
Pasted image 20241226162441.png

Count num of rows where null is present.

Dataframe API Solution: -

option(nullValue)

from pyspark.sql.functions import *

df = spark.read.option("nullValue","null").option("header",True).csv("Files/Book1.csv")
df.show()
df.printSchema()
+---+----+----+
| ID|NAME| Age|
+---+----+----+
|  1|   A|  23|
|  2|   B|NULL|
|  3|   C|  56|
|  4|NULL|NULL|
|  5|NULL|NULL|
+---+----+----+

root
 |-- ID: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- Age: string (nullable = true)
df_res = df.select([count(when(col(i).isNull(),i)).alias(i) for i in df.columns])
df_res.show()
+---+----+---+
| ID|NAME|Age|
+---+----+---+
|  0|   2|  3|
+---+----+---+

Spark SQL Solution: -

df.createOrReplaceTempView("cnt_null")

sql_query = """
SELECT 
    SUM(CASE WHEN ID IS NULL THEN 1 ELSE 0 END) AS ID,
    SUM(CASE WHEN NAME IS NULL THEN 1 ELSE 0 END) AS NAME,
    SUM(CASE WHEN Age IS NULL THEN 1 ELSE 0 END) AS Age
FROM cnt_null
"""

res = spark.sql(sql_query)
res.show()