import pyspark
from pyspark.sql import SparkSession
# avg, sum, first, etc. live in pyspark.sql.functions
# (note: this sum shadows Python's built-in sum).
from pyspark.sql.functions import avg, col, explode, first, sum

# Create (or reuse) a SparkSession; one is needed before any PySpark work.
spark = SparkSession.builder.getOrCreate()

# Read a CSV file with a header row (every column comes back as a string).
df_pyspark = spark.read.option('header', 'true').csv('/path/name.csv')
# Same read, but inferSchema=True asks Spark to detect column types.
df_pyspark = spark.read.option('header', 'true').csv('/path/name.csv', inferSchema=True)
# Drop any row that contains at least one null value (default how='any').
df_pyspark = df_pyspark.na.drop()
# Drop only rows where every column is null.
df_pyspark = df_pyspark.na.drop(how='all')
# Keep rows with at least 2 non-null values (thresh overrides how).
df_pyspark = df_pyspark.na.drop(how='any', thresh=2)
# Drop rows that have a null in the listed column(s) only.
df_pyspark = df_pyspark.na.drop(how='any', subset=['column_name'])
# Replace nulls in the listed columns with a fixed value.
df_pyspark = df_pyspark.na.fill('value_to_fill', ['column_name1', 'column_name2'])
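
# A minimal sketch to try the null-handling calls above on in-memory data
# (the columns and values here are hypothetical, not from the CSV):
sample_df = spark.createDataFrame(
    [("Adam", "Marketing", 60000), ("Ram", None, 45000), (None, None, None)],
    ['name', 'department', 'salary'],
)
sample_df.na.drop(how='any').show()                   # keeps only the fully populated row
sample_df.na.fill({'department': 'Unknown'}).show()   # per-column fill values via a dict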

# Average salary per department.
df_avg_sal = df_pyspark.groupBy("department").agg(avg("salary").alias("avg_salary"))
df_avg_sal.show()

from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Rank employees within each department by salary, highest first.
ranking = Window.partitionBy("department").orderBy(col("salary").desc())
top_sal_emp = df_pyspark.withColumn("Rank", row_number().over(ranking))
# Rank 1 is the top-paid employee in each department.
top_sal_emp.filter(top_sal_emp.Rank == 1).show()
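
# row_number() breaks ties arbitrarily; if several employees can share a
# department's top salary and all of them should be kept, dense_rank() is an
# alternative (a sketch, reusing the same window as above):
from pyspark.sql.functions import dense_rank
top_sal_ties = df_pyspark.withColumn("Rank", dense_rank().over(ranking))
top_sal_ties.filter(top_sal_ties.Rank == 1).show()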

# Marketing employees earning more than 50,000.
df_top_marketer = df_pyspark.filter((df_pyspark.salary > 50000) & (df_pyspark.department == "Marketing"))
df_top_marketer.show()

# Compare each employee's salary against their department's average.
partition_col = Window.partitionBy("department")
avg_sal_emp = df_pyspark.withColumn("avg_sal", avg(col("salary")).over(partition_col))
diff_sal_comp = avg_sal_emp.withColumn("sal_diff", col("salary") - col("avg_sal"))
diff_sal_comp.show()

# Total salary of employees whose name starts with "A"
# (sum here is pyspark.sql.functions.sum, imported above).
total_sal = df_pyspark.filter(col("name").startswith("A")).agg(sum(col("salary")).alias("total_salary"))
total_sal.show()

# Strip a one-character prefix (e.g. a currency symbol) and cast the rest to float:
# substr(2, 100) keeps up to 100 characters starting at position 2 (1-based).
sales_df = sales_df.withColumn("new_price", sales_df.price.substr(2, 100).cast('float'))
sales_df.show()
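
# If the price strings are less uniform (longer currency symbols, thousands
# separators), a sketch using regexp_replace to keep only digits and the
# decimal point (assumes a column literally named "price"):
from pyspark.sql.functions import regexp_replace
sales_df_alt = sales_df.withColumn(
    "new_price", regexp_replace(col("price"), "[^0-9.]", "").cast("float")
)
sales_df_alt.show()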

# Employees sorted by salary, highest first.
emp_df = emp_df.sort(col('salary').desc())
emp_df.show()

# count() returns a plain integer, so print it instead of calling .show().
print(df.distinct().count())

# Rename a column.
df = df.withColumnRenamed('name', 'first_name')

# Re-read the CSV and expose it to Spark SQL as a temporary view
# (registerTempTable is deprecated; createOrReplaceTempView is the current API).
df = spark.read.option('header', 'true').csv('/path/name.csv', inferSchema=True)
df.createOrReplaceTempView("temp_table")
query = spark.sql("select * from temp_table")
query.show()
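
# Any SQL can run against the view; a sketch assuming the same department/salary
# columns used earlier:
dept_avg = spark.sql(
    "select department, avg(salary) as avg_salary from temp_table group by department"
)
dept_avg.show()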

# Write the DataFrame out in different formats (each call creates a directory of part files).
df.write.csv('abc.csv')
df.write.json('abc.json')
df.write.parquet('abc.parquet')
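
# The calls above fail if the target path already exists; a sketch with an
# explicit save mode and a header row for the CSV output:
df.write.mode("overwrite").option("header", "true").csv('abc.csv')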

# Pivot: one row per employee, one column per day, cell = first time_spent value.
df_new = df.groupby("emp_name").pivot("day").agg(first(df.time_spent))
df_new.show()

# Flatten an array column: one output row per element of "Items".
df_normalized = df.select("Id", explode("Items").alias("Item"))
df_normalized.show()
from pyspark.sql.functions import col, explode, split

# Split a comma-separated Name string into an array, then explode it into rows.
df1 = spark.createDataFrame([
    (1, "Adam, Amani , Singh"),
    (2, "Ram"),
    (3, "Shiv, Gupta")
], schema='Id long, Name string')
df2 = df1.select(col("Id"), explode(split(col("Name"), ",")).alias("Name"))
df1.show()
df2.show()
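
# split(..., ",") keeps the spaces that surrounded each comma (" Amani ", " Singh");
# a sketch using trim to clean them up:
from pyspark.sql.functions import trim
df2_clean = df2.withColumn("Name", trim(col("Name")))
df2_clean.show()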