# Sample records, one tuple per row: (Id, Name, Age, Gender, Marks).
data = [
    (1, "Sagar", 23, "Male", 68.0),
    (2, "Kim", 35, "Female", 90.2),
    (3, "Alex", 40, "Male", 79.1),
]

# DDL-style schema string; the column order matches the tuple layout above.
schema = "Id int,Name string,Age int,Gender string,Marks float"

# NOTE(review): `spark` is not defined in this snippet -- presumably the
# session object injected by the notebook runtime; confirm.
df = spark.createDataFrame(data=data, schema=schema)

# Display the rows first, then the resolved column types.
df.show()
df.printSchema()
+---+-----+---+------+-----+
| Id| Name|Age|Gender|Marks|
+---+-----+---+------+-----+
|  1|Sagar| 23|  Male| 68.0|
|  2|  Kim| 35|Female| 90.2|
|  3| Alex| 40|  Male| 79.1|
+---+-----+---+------+-----+
root
|-- Id: integer (nullable = true)
|-- Name: string (nullable = true)
|-- Age: integer (nullable = true)
|-- Gender: string (nullable = true)
|-- Marks: float (nullable = true)
# Print every (column name, type string) pair reported by the DataFrame.
for column_name, type_name in df.dtypes:
    print((column_name, type_name))
('Id', 'int')
('Name', 'string')
('Age', 'int')
('Gender', 'string')
('Marks', 'float')
# Distinct type strings that occur among the DataFrame's columns.
set_dtypes = {type_name for _, type_name in df.dtypes}
print(set_dtypes)
{'string', 'float', 'int'}
# For each distinct type, write the columns of that type to their own
# output location, named after the type string.
for i in set_dtypes:
    # Names of the columns whose type string equals the current type.
    cols = [
        column_name
        for column_name, type_name in df.dtypes
        if type_name == i
    ]
    # "overwrite" mode replaces whatever already exists at the target path.
    # No format is given, so the session's default data source is used
    # (NOTE(review): confirm which format that is in this environment).
    df.select(cols).write.mode("overwrite").save(f"Files/Output/{i}")