# Define an explicit schema for the "store.csv" dataset so Spark does not
# have to infer column types (faster and deterministic).
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

Schema = StructType([
    StructField("Store", StringType(), nullable=True),
    StructField("StoreType", StringType(), nullable=True),
    StructField("Assortment", StringType(), nullable=True),
    StructField("CompetitionDistance", FloatType(), nullable=True),
    StructField("CompetitionOpenSinceMonth", IntegerType(), nullable=True),
    StructField("CompetitionOpenSinceYear", IntegerType(), nullable=True),
    StructField("Promo2", IntegerType(), nullable=True),
    StructField("Promo2SinceWeek", IntegerType(), nullable=True),
    StructField("Promo2SinceYear", IntegerType(), nullable=True),
    StructField("PromoInterval", StringType(), nullable=True),
])

# Read the CSV using the explicit schema; the first line is a header row.
# NOTE(review): `spark` (a SparkSession) is assumed to exist in the
# surrounding environment (e.g. a notebook/shell) — confirm before running
# as a standalone script.
df = spark.read.option("header", True).schema(Schema).csv("store.csv")

# We can drop invalid rows while reading the dataset by setting the read
# mode to "DROPMALFORMED" (rows that fail to parse are silently discarded).
df_1 = (
    spark.read
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .csv("store.csv")
)

df.show()
                            
Created By
Metadata
Comments
No comments yet. Add yours below!
Add a Comment
Related Cheat Sheets
More Cheat Sheets by datamansam