Problem Statement: -
Given a DataFrame, count the number of nulls in each column.
from pyspark.sql.functions import col, count, sum, when  # note: this sum shadows Python's builtin

# Sample data with nulls scattered across all three columns
data = [(1, 'Sagar', 23), (2, None, 34), (None, None, 40), (5, 'Alex', None), (4, 'Kim', None)]
df = spark.createDataFrame(data, schema="ID int, Name string, Age int")
df.show()
+----+-----+----+
| ID| Name| Age|
+----+-----+----+
| 1|Sagar| 23|
| 2| NULL| 34|
|NULL| NULL| 40|
| 5| Alex|NULL|
| 4| Kim|NULL|
+----+-----+----+
# One aggregate per column: emit 1 for each null, 0 otherwise, then sum over all rows
df_cnt = df.select(
    sum(when(col("ID").isNull(), 1).otherwise(0)).alias("ID"),
    sum(when(col("Name").isNull(), 1).otherwise(0)).alias("Name"),
    sum(when(col("Age").isNull(), 1).otherwise(0)).alias("Age")
)
df_cnt.show()
+---+----+---+
| ID|Name|Age|
+---+----+---+
| 1| 2| 2|
+---+----+---+
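The same counts can be computed without when at all: count over a column skips nulls, so subtracting each column's non-null count from the total row count gives its null count. A minimal single-pass sketch of that idea (the df_diff name is mine, not from the original):

from pyspark.sql.functions import col, count, lit

# Null count per column = total rows - non-null rows, in one aggregation pass.
# count(lit(1)) counts every row; count(col(c)) counts only non-null values.
df_diff = df.select(
    [(count(lit(1)) - count(col(c))).alias(c) for c in df.columns]
)
df_diff.show()  # same one-row result: ID=1, Name=2, Age=2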
Parameterized Solution: -
# One aggregate per column, built from df.columns. count() skips nulls, and
# when(col(i).isNull(), i) yields a non-null marker only on null rows, so
# count() here counts exactly the nulls in column i.
df_cnt_prmt = df.select([count(when(col(i).isNull(), i)).alias(i) for i in df.columns])
df_cnt_prmt.show()
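This prints the same one-row result as above (ID=1, Name=2, Age=2). If the counts are needed in driver-side Python rather than as a DataFrame, the single result row converts cleanly to a dict; a small usage sketch (the null_counts name is my own):

# Collect the one-row result and turn it into {column: null_count}
null_counts = df_cnt_prmt.first().asDict()
print(null_counts)  # {'ID': 1, 'Name': 2, 'Age': 2}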
Spark SQL Solution: -
# Register the DataFrame as a temp view so it can be queried with Spark SQL
df.createOrReplaceTempView("nuls")

sql_query = """
SELECT
    SUM(CASE WHEN ID IS NULL THEN 1 ELSE 0 END) AS ID,
    SUM(CASE WHEN Name IS NULL THEN 1 ELSE 0 END) AS Name,
    SUM(CASE WHEN Age IS NULL THEN 1 ELSE 0 END) AS Age
FROM nuls
"""
res = spark.sql(sql_query)
res.show()
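The SQL version can be parameterized the same way by generating one CASE expression per column from df.columns. A sketch under the assumption that all column names are plain identifiers (anything needing quoting would want backticks):

# Build the per-column CASE expressions dynamically, mirroring the
# parameterized DataFrame solution above.
exprs = ", ".join(
    f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS {c}" for c in df.columns
)
spark.sql(f"SELECT {exprs} FROM nuls").show()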