spark-dataframe-practise

spark-dataframe-practise

August 23, 2024

DataFrame Operations

DataFrame Operations
DataFrames are distributed collection of data which is mapped to named columns. Uses Spark SQL Optimizer engine for better processing of Data. DataFrame can be created 1. From RDD 2. From Data files 3. From Spark Session
Normal DataFrame 1. Creating DataFrame from RDD >>> empnamesrdd = sc.parallelize(['Sally Bowen','Larry Becton','Rose Angela','Michael Todd','Joe Kimberly','Frank Patrick', 'Bonnie Lisa']) >>> from pyspark.sql import Row >>> empnamesrdd.map(lambda d: Row(d)).toDF() DataFrame[_1: string] Without specifying schema >>> empnamesrdd.map(lambda d: Row(d)).toDF().show() +-------------+ \| _1\| +-------------+ \| Sally Bowen\| \| Larry Becton\| \| Rose Angela\| \| Michael Todd\| \| Joe Kimberly\| \|Frank Patrick\| \| Bonnie Lisa\| +-------------+ >>> empnamesrdd.map(lambda d: Row(d)).toDF(['names']).show() +-------------+ \| names\| +-------------+ \| Sally Bowen\| \| Larry Becton\| \| Rose Angela\| \| Michael Todd\| \| Joe Kimberly\| \|Frank Patrick\| \| Bonnie Lisa\| +-------------+ >>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv') >>> stockscsv.show() +------------+----------+------+------+------+----------+ \| _c0\| _c1\| _c2\| _c3\| _c4\| _c5\| +------------+----------+------+------+------+----------+ \|stock_symbol\|prev_close\| open\| high\| low\| date\| \| FT\| 255.01\|828.04\|361.58\|841.53\|2022-01-01\| \| COHR\| 445.76\|813.11\|937.33\|810.12\|2022-01-01\| \| CHH\| 428.5\|928.85\|929.36\|215.53\|2022-01-01\| \| GD\| 439.49\|574.96\|976.55\|772.67\|2022-01-01\| \| PSAT\| 276.16\|681.53\|328.67\|872.12\|2022-01-01\| +------------+----------+------+------+------+----------+ >>> stockscsv.schema StructType(List(StructField(_c0,StringType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true),StructField(_c4,StringType,true),StructField(_c5,StringType,true))) >>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv', inferSchema=True, header=True) >>> stockscsv.schema StructType(List(StructField(stock_symbol,StringType,true),StructField(prev_close,DoubleType,true),StructField(open,DoubleType,true),StructField(high,DoubleType,true),StructField(low,DoubleType,true),StructField(date,TimestampType,true))) >>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv', inferSchema=False, header=True) >>> stockscsv.schema StructType(List(StructField(stock_symbol,StringType,true),StructField(prev_close,StringType,true),StructField(open,StringType,true),StructField(high,StringType,true),StructField(low,StringType,true),StructField(date,StringType,true))) Make sure mysql connector jar on classpath or in SPARK_HOME/lib directory >>> sampledb_df = spark.read.jdbc('jdbc:mysql://localhost:3306/sampledb','sampletb',properties={'user':'root','password':'root'}) >>> sampledb_df.show() +------+ \|number\| +------+ \| 1\| \| 2\| \| 3\| \| 4\| \| 5\| +------+ >>> empsaldata=[(123,3434,'USD'),(456,43000,'INR'),(789,42353,'EUR'),(637,6500,'SGD'),(345,6000,'AED')] >>> empsaldf = spark.createDataFrame(empsaldata) >>> empsaldf.show() +---+-----+---+ \| _1\| _2\| _3\| +---+-----+---+ \|123\| 3434\|USD\| \|456\|43000\|INR\| \|789\|42353\|EUR\| \|637\| 6500\|SGD\| \|345\| 6000\|AED\| +---+-----+---+ >>> empsaldf = spark.createDataFrame(empsaldata,['id','salary','currency']) >>> empsaldf.show() +---+------+--------+ \| id\|salary\|currency\| +---+------+--------+ \|123\| 3434\| USD\| \|456\| 43000\| INR\| \|789\| 42353\| EUR\| \|637\| 6500\| SGD\| \|345\| 6000\| AED\| +---+------+--------+ >>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType >>> empsalschema = StructType([StructField('id',IntegerType(),False), ... StructField('salary',LongType(),False), ... StructField('currency',StringType(),False)]) >>> empsaldf = spark.createDataFrame(empsaldata,empsalschema) >>> empsaldf.show() +---+------+--------+ \| id\|salary\|currency\| +---+------+--------+ \|123\| 3434\| USD\| \|456\| 43000\| INR\| \|789\| 42353\| EUR\| \|637\| 6500\| SGD\| \|345\| 6000\| AED\| +---+------+--------+ >>> empsaldf.schema StructType(List(StructField(id,IntegerType,false),StructField(salary,LongType,false),StructField(currency,StringType,false))) >>> empsaldf.col empsaldf.colRegex( empsaldf.collect( empsaldf.columns >>> empsaldf.columns ['id', 'salary', 'currency'] >>> for col in empsaldf.columns: ... print(col) ... id salary currency >>> empsalnewcols = [('es_'+col) for col in empsaldf.columns] >>> empsalnewcols ['es_id', 'es_salary', 'es_currency'] >>> empsaldf.schema StructType(List(StructField(id,IntegerType,false),StructField(salary,LongType,false),StructField(currency,StringType,false))) >>> empsaloldcols = empsaldf.columns >>> empsaloldcols ['id', 'salary', 'currency'] >>> empsalnewcols ['es_id', 'es_salary', 'es_currency'] >>> for i in range(len(empsaloldcols)): ... print(empsaloldcols[i]) ... id salary currency >>> for i in range(len(empsaloldcols)): ... empsaldf = empsaldf.withColumnRenamed(empsaloldcols[i],empsalnewcols[i]) ... >>> empsaldf.schema StructType(List(StructField(es_id,IntegerType,false),StructField(es_salary,LongType,false),StructField(es_currency,StringType,false)))
DataFrames - sort, group by, order by, with expression, filter >>> emps = sc.parallelize([(1,'Avc Ser',2300,'L1'),(2,'Kghf Jfjs',3400,'L1'),(3,'Jgf Fjf',2455,'L2'),(4,'Ejh Ruyg',3500,'L3'),(5,'Efgw Ygefrf',5400,'L4'),(6,'Fuyg Ruyoi',4020,'L4'),(7,"Gfgd Tefdf",3500,'L4')]) >>> empdf = spark.createDataFrame(emps) >>> empdf.show() +---+-----------+----+---+ \| _1\| _2\| _3\| _4\| +---+-----------+----+---+ \| 1\| Avc Ser\|2300\| L1\| \| 2\| Kghf Jfjs\|3400\| L1\| \| 3\| Jgf Fjf\|2455\| L2\| \| 4\| Ejh Ruyg\|3500\| L3\| \| 5\|Efgw Ygefrf\|5400\| L4\| \| 6\| Fuyg Ruyoi\|4020\| L4\| \| 7\| Gfgd Tefdf\|3500\| L4\| +---+-----------+----+---+ >>> empdf = spark.createDataFrame(emps,['empid','name','salary','location']) >>> empdf.show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 2\| Kghf Jfjs\| 3400\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| +-----+-----------+------+--------+ >>> empdf.schema StructType(List(StructField(empid,LongType,true),StructField(name,StringType,true),StructField(salary,LongType,true),StructField(location,StringType,true))) >>> locs = sc.parallelize([('L1','Accounts','New York'),('L2','Sales','London'),('L3','Marketing','Dubai'),('L4','RND','Mumbai')]) >>> locdf = spark.createDataFrame(locs,["location_id","department","city"]) >>> locdf.show() +-----------+----------+--------+ \|location_id\|department\| city\| +-----------+----------+--------+ \| L1\| Accounts\|New York\| \| L2\| Sales\| London\| \| L3\| Marketing\| Dubai\| \| L4\| RND\| Mumbai\| +-----------+----------+--------+ >>> empdf.groupby('location').count().show() +--------+-----+ \|location\|count\| +--------+-----+ \| L1\| 2\| \| L4\| 3\| \| L3\| 1\| \| L2\| 1\| +--------+-----+ >>> empdf.join(locdf,empdf.location==locdf.location_id).show() +-----+-----------+------+--------+-----------+----------+--------+ \|empid\| name\|salary\|location\|location_id\|department\| city\| +-----+-----------+------+--------+-----------+----------+--------+ \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\|New York\| \| 2\| Kghf Jfjs\| 3400\| L1\| L1\| Accounts\|New York\| \| 5\|Efgw Ygefrf\| 5400\| L4\| L4\| RND\| Mumbai\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| L4\| RND\| Mumbai\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| 4\| Ejh Ruyg\| 3500\| L3\| L3\| Marketing\| Dubai\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| +-----+-----------+------+--------+-----------+----------+--------+ >>> empdf.sort(F.col('name')).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.sort('name').show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.sort('name',ascending=True).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.sort('name',ascending=False).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 2\| Kghf Jfjs\| 3400\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 1\| Avc Ser\| 2300\| L1\| +-----+-----------+------+--------+ >>> empdf.sort('salary',ascending=False).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 2\| Kghf Jfjs\| 3400\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 1\| Avc Ser\| 2300\| L1\| +-----+-----------+------+--------+ >>> empdf.sort('salary',ascending=True).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 5\|Efgw Ygefrf\| 5400\| L4\| +-----+-----------+------+--------+ >>> empdf.groupBy('location').count().select(F.col('location'),F.col('count').alias('location_count')).show() +--------+--------------+ \|location\|location_count\| +--------+--------------+ \| L1\| 2\| \| L4\| 3\| \| L3\| 1\| \| L2\| 1\| +--------+--------------+ >>> empdf.groupBy('location').count().alias('location_count').explain() == Physical Plan == (2) HashAggregate(keys=[location#24], functions=[count(1)]) +- Exchange hashpartitioning(location#24, 200) +- (1) HashAggregate(keys=[location#24], functions=[partial_count(1)]) +- *(1) Project [location#24] +- Scan ExistingRDD[empid#21L,name#22,salary#23L,location#24] >>> empdf.orderBy("name").show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.orderBy("name").show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.orderBy(F.desc("name")).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 2\| Kghf Jfjs\| 3400\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 1\| Avc Ser\| 2300\| L1\| +-----+-----------+------+--------+ >>> empdf.orderBy("location",F.desc("salary")).show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 2\| Kghf Jfjs\| 3400\| L1\| \| 1\| Avc Ser\| 2300\| L1\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 7\| Gfgd Tefdf\| 3500\| L4\| +-----+-----------+------+--------+ >>> empdf.orderBy(F.desc("location"),"salary").show() +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 7\| Gfgd Tefdf\| 3500\| L4\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| \| 5\|Efgw Ygefrf\| 5400\| L4\| \| 4\| Ejh Ruyg\| 3500\| L3\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 1\| Avc Ser\| 2300\| L1\| \| 2\| Kghf Jfjs\| 3400\| L1\| +-----+-----------+------+--------+ >>> empdf.select("empid","location").show() +-----+--------+ \|empid\|location\| +-----+--------+ \| 1\| L1\| \| 2\| L1\| \| 3\| L2\| \| 4\| L3\| \| 5\| L4\| \| 6\| L4\| \| 7\| L4\| +-----+--------+ >>> empdf.select("empid","location").where("salary > 3000").show() +-----+--------+ \|empid\|location\| +-----+--------+ \| 2\| L1\| \| 4\| L3\| \| 5\| L4\| \| 6\| L4\| \| 7\| L4\| +-----+--------+ >>> empdf.selectExpr("salary > 4000").show() +---------------+ \|(salary > 4000)\| +---------------+ \| false\| \| false\| \| false\| \| false\| \| true\| \| true\| \| false\| +---------------+ >>> empdf.selectExpr("empid","salary > 4000").show() +-----+---------------+ \|empid\|(salary > 4000)\| +-----+---------------+ \| 1\| false\| \| 2\| false\| \| 3\| false\| \| 4\| false\| \| 5\| true\| \| 6\| true\| \| 7\| false\| +-----+---------------+ >> empdf.withColumn("high_salary", F.when(F.col('salary') > 4000,'yes').otherwise('No')).show() +-----+-----------+------+--------+-----------+ \|empid\| name\|salary\|location\|high_salary\| +-----+-----------+------+--------+-----------+ \| 1\| Avc Ser\| 2300\| L1\| No\| \| 2\| Kghf Jfjs\| 3400\| L1\| No\| \| 3\| Jgf Fjf\| 2455\| L2\| No\| \| 4\| Ejh Ruyg\| 3500\| L3\| No\| \| 5\|Efgw Ygefrf\| 5400\| L4\| yes\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| yes\| \| 7\| Gfgd Tefdf\| 3500\| L4\| No\| +-----+-----------+------+--------+-----------+ >>> empdf = empdf.withColumn('high_salary', F.when(F.col('salary') > 4000,'yes').otherwise('No')).withColumn('hq_high_salary',F.when((F.col('location') == 'L1') & (F.col('salary') > 3000),'yes').otherwise('no')) +-----+-----------+------+--------+-----------+--------------+ \|empid\| name\|salary\|location\|high_salary\|hq_high_salary\| +-----+-----------+------+--------+-----------+--------------+ \| 1\| Avc Ser\| 2300\| L1\| No\| no\| \| 2\| Kghf Jfjs\| 3400\| L1\| No\| yes\| \| 3\| Jgf Fjf\| 2455\| L2\| No\| no\| \| 4\| Ejh Ruyg\| 3500\| L3\| No\| no\| \| 5\|Efgw Ygefrf\| 5400\| L4\| yes\| no\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| yes\| no\| \| 7\| Gfgd Tefdf\| 3500\| L4\| No\| no\| +-----+-----------+------+--------+-----------+--------------+ >>> empdf.withColumnRenamed('hq_high_salary','head_quarters_high_sal').show() +-----+-----------+------+--------+-----------+---------------------+ \|empid\| name\|salary\|location\|high_salary\|head_quarters_high_sal\| +-----+-----------+------+--------+-----------+---------------------+ \| 1\| Avc Ser\| 2300\| L1\| No\| no\| \| 2\| Kghf Jfjs\| 3400\| L1\| No\| yes\| \| 3\| Jgf Fjf\| 2455\| L2\| No\| no\| \| 4\| Ejh Ruyg\| 3500\| L3\| No\| no\| \| 5\|Efgw Ygefrf\| 5400\| L4\| yes\| no\| \| 6\| Fuyg Ruyoi\| 4020\| L4\| yes\| no\| \| 7\| Gfgd Tefdf\| 3500\| L4\| No\| no\| +-----+-----------+------+--------+-----------+---------------------+
DataFrame Joins >>> emps = sc.parallelize([(1,'Avc Ser',2300,'L1'),(2,'Kghf Jfjs',3400,'L2'),(3,'Jgf Fjf',2455,'L2'),(4,'Ejh Ruyg',3500,''),(5,'Efgw Ygefrf',5400,''),(6,'Fuyg Ruyoi',4020,'L5'),(7,"Gfgd Tefdf",3500,'L4')]) >>> empdf = spark.createDataFrame(emps,['empid','name','salary','location']) >>> empdf.show() total rows - 7 +-----+-----------+------+--------+ \|empid\| name\|salary\|location\| +-----+-----------+------+--------+ \| 1\| Avc Ser\| 2300\| L1\| \| 2\| Kghf Jfjs\| 3400\| L2\| \| 3\| Jgf Fjf\| 2455\| L2\| \| 4\| Ejh Ruyg\| 3500\| \| \| 5\|Efgw Ygefrf\| 5400\| \| \| 6\| Fuyg Ruyoi\| 4020\| L5\| \| 7\| Gfgd Tefdf\| 3500\| L4\| +-----+-----------+------+--------+ >>> locs = sc.parallelize([('L1','Accounts','New York'),('L2','Sales','London'),('L3','Marketing','Dubai'),('L4','RND','Mumbai'),('L5','Finance','Singapore'),('L6','Manufacturing','Beijing')]) >>> locdf = spark.createDataFrame(locs,["location_id","department","city"]) >>> locdf.show() total rows - 6 +-----------+-------------+---------+ \|location_id\| department\| city\| +-----------+-------------+---------+ \| L1\| Accounts\| New York\| \| L2\| Sales\| London\| \| L3\| Marketing\| Dubai\| \| L4\| RND\| Mumbai\| \| L5\| Finance\|Singapore\| \| L6\|Manufacturing\| Beijing\| +-----------+-------------+---------+ >>> empdf.crossJoin(locdf).count() 42 >>> empdf.crossJoin(locdf).show(empdf.count() * locdf.count()) +-----+-----------+------+--------+-----------+-------------+---------+ \|empid\| name\|salary\|location\|location_id\| department\| city\| +-----+-----------+------+--------+-----------+-------------+---------+ \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\| New York\| \| 1\| Avc Ser\| 2300\| L1\| L2\| Sales\| London\| \| 1\| Avc Ser\| 2300\| L1\| L3\| Marketing\| Dubai\| \| 1\| Avc Ser\| 2300\| L1\| L4\| RND\| Mumbai\| \| 1\| Avc Ser\| 2300\| L1\| L5\| Finance\|Singapore\| \| 1\| Avc Ser\| 2300\| L1\| L6\|Manufacturing\| Beijing\| \| 2\| Kghf Jfjs\| 3400\| L2\| L1\| Accounts\| New York\| \| 2\| Kghf Jfjs\| 3400\| L2\| L2\| Sales\| London\| \| 2\| Kghf Jfjs\| 3400\| L2\| L3\| Marketing\| Dubai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L4\| RND\| Mumbai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L5\| Finance\|Singapore\| \| 2\| Kghf Jfjs\| 3400\| L2\| L6\|Manufacturing\| Beijing\| \| 3\| Jgf Fjf\| 2455\| L2\| L1\| Accounts\| New York\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| \| 3\| Jgf Fjf\| 2455\| L2\| L3\| Marketing\| Dubai\| \| 3\| Jgf Fjf\| 2455\| L2\| L4\| RND\| Mumbai\| \| 3\| Jgf Fjf\| 2455\| L2\| L5\| Finance\|Singapore\| \| 3\| Jgf Fjf\| 2455\| L2\| L6\|Manufacturing\| Beijing\| \| 4\| Ejh Ruyg\| 3500\| \| L1\| Accounts\| New York\| \| 4\| Ejh Ruyg\| 3500\| \| L2\| Sales\| London\| \| 4\| Ejh Ruyg\| 3500\| \| L3\| Marketing\| Dubai\| \| 4\| Ejh Ruyg\| 3500\| \| L4\| RND\| Mumbai\| \| 4\| Ejh Ruyg\| 3500\| \| L5\| Finance\|Singapore\| \| 4\| Ejh Ruyg\| 3500\| \| L6\|Manufacturing\| Beijing\| \| 5\|Efgw Ygefrf\| 5400\| \| L1\| Accounts\| New York\| \| 5\|Efgw Ygefrf\| 5400\| \| L2\| Sales\| London\| \| 5\|Efgw Ygefrf\| 5400\| \| L3\| Marketing\| Dubai\| \| 5\|Efgw Ygefrf\| 5400\| \| L4\| RND\| Mumbai\| \| 5\|Efgw Ygefrf\| 5400\| \| L5\| Finance\|Singapore\| \| 5\|Efgw Ygefrf\| 5400\| \| L6\|Manufacturing\| Beijing\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L1\| Accounts\| New York\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L2\| Sales\| London\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L3\| Marketing\| Dubai\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L4\| RND\| Mumbai\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L5\| Finance\|Singapore\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L6\|Manufacturing\| Beijing\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L1\| Accounts\| New York\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L2\| Sales\| London\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L3\| Marketing\| Dubai\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L5\| Finance\|Singapore\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L6\|Manufacturing\| Beijing\| +-----+-----------+------+--------+-----------+-------------+---------+ >>> empdf.join(locdf,empdf.location == locdf.location_id, "inner").show() +-----+----------+------+--------+-----------+----------+---------+ \|empid\| name\|salary\|location\|location_id\|department\| city\| +-----+----------+------+--------+-----------+----------+---------+ \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\| New York\| \| 7\|Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L2\| Sales\| London\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| \| 6\|Fuyg Ruyoi\| 4020\| L5\| L5\| Finance\|Singapore\| +-----+----------+------+--------+-----------+----------+---------+ left or leftouter >>> empdf.join(locdf,empdf.location == locdf.location_id, "left").show() +-----+-----------+------+--------+-----------+----------+---------+ \|empid\| name\|salary\|location\|location_id\|department\| city\| +-----+-----------+------+--------+-----------+----------+---------+ \| 4\| Ejh Ruyg\| 3500\| \| null\| null\| null\| \| 5\|Efgw Ygefrf\| 5400\| \| null\| null\| null\| \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\| New York\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L2\| Sales\| London\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L5\| Finance\|Singapore\| +-----+-----------+------+--------+-----------+----------+---------+ right or rightouter >>> empdf.join(locdf,empdf.location == locdf.location_id, "right").show() +-----+----------+------+--------+-----------+-------------+---------+ \|empid\| name\|salary\|location\|location_id\| department\| city\| +-----+----------+------+--------+-----------+-------------+---------+ \| null\| null\| null\| null\| L6\|Manufacturing\| Beijing\| \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\| New York\| \| 7\|Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| null\| null\| null\| null\| L3\| Marketing\| Dubai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L2\| Sales\| London\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| \| 6\|Fuyg Ruyoi\| 4020\| L5\| L5\| Finance\|Singapore\| +-----+----------+------+--------+-----------+-------------+---------+ full or fullouter or outer or full_outer >>> empdf.join(locdf,empdf.location == locdf.location_id, "full").show() +-----+-----------+------+--------+-----------+-------------+---------+ \|empid\| name\|salary\|location\|location_id\| department\| city\| +-----+-----------+------+--------+-----------+-------------+---------+ \| null\| null\| null\| null\| L6\|Manufacturing\| Beijing\| \| 4\| Ejh Ruyg\| 3500\| \| null\| null\| null\| \| 5\|Efgw Ygefrf\| 5400\| \| null\| null\| null\| \| 1\| Avc Ser\| 2300\| L1\| L1\| Accounts\| New York\| \| 7\| Gfgd Tefdf\| 3500\| L4\| L4\| RND\| Mumbai\| \| null\| null\| null\| null\| L3\| Marketing\| Dubai\| \| 2\| Kghf Jfjs\| 3400\| L2\| L2\| Sales\| London\| \| 3\| Jgf Fjf\| 2455\| L2\| L2\| Sales\| London\| \| 6\| Fuyg Ruyoi\| 4020\| L5\| L5\| Finance\|Singapore\| +-----+-----------+------+--------+-----------+-------------+---------+
Compiled on FRIDAY, 23-AUGUST-2024, 05:07:50 PM IST

Comments