Problem Statement:

Pasted image 20241228163258.png
Given a dataset, get separate columns for the first name and the last name.

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Sample input: (ID, Name) rows whose two name parts are separated by either
# a hyphen or a single space.
data = [
    (1, "Sagar-Prajapati"),
    (2, "Alex-John"),
    (3, "John Cena"),
    (4, "Kim Joe"),
]
schema = "ID int,Name string"

# NOTE: `spark` is not created in this snippet; presumably it is the
# SparkSession supplied by the notebook/shell environment.
df = spark.createDataFrame(data, schema=schema)

df.show()

regexp_replace()

# Replace every hyphen in Name with a space so that all rows use the same
# separator between the two name parts.
df1 = df.withColumn(
    "Name",
    regexp_replace(col("Name"), r'(-)', " "),
)
df1.show()

+---+---------------+
| ID|           Name|
+---+---------------+
|  1|Sagar Prajapati|
|  2|      Alex John|
|  3|      John Cena|
|  4|        Kim Joe|
+---+---------------+

# Split Name on the space separator once, then take the first token as the
# first name and the second token as the surname; Name itself is dropped.
name_parts = split(col("Name"), " ")
df2 = (
    df1
    .withColumn("First_Name", name_parts.getItem(0))
    .withColumn("Surname", name_parts.getItem(1))
    .drop("Name")
)
df2.show()

+---+----------+---------+
| ID|First_Name|  Surname|
+---+----------+---------+
|  1|     Sagar|Prajapati|
|  2|      Alex|     John|
|  3|      John|     Cena|
|  4|       Kim|      Joe|
+---+----------+---------+