|
DataFrames are distributed collection of data which is mapped to named columns. Uses Spark SQL Optimizer engine for better processing of Data. DataFrame can be created 1. From RDD 2. From Data files 3. From Spark Session
|
1. Creating DataFrame from RDD
>>> empnamesrdd = sc.parallelize(['Sally Bowen','Larry Becton','Rose Angela','Michael Todd','Joe Kimberly','Frank Patrick', 'Bonnie Lisa'])
>>> from pyspark.sql import Row
>>> empnamesrdd.map(lambda d: Row(d)).toDF()
DataFrame[_1: string]
Without specifying schema
>>> empnamesrdd.map(lambda d: Row(d)).toDF().show()
+-------------+
| _1|
+-------------+
| Sally Bowen|
| Larry Becton|
| Rose Angela|
| Michael Todd|
| Joe Kimberly|
|Frank Patrick|
| Bonnie Lisa|
+-------------+
>>> empnamesrdd.map(lambda d: Row(d)).toDF(['names']).show()
+-------------+
| names|
+-------------+
| Sally Bowen|
| Larry Becton|
| Rose Angela|
| Michael Todd|
| Joe Kimberly|
|Frank Patrick|
| Bonnie Lisa|
+-------------+
>>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv')
>>> stockscsv.show()
+------------+----------+------+------+------+----------+
| _c0| _c1| _c2| _c3| _c4| _c5|
+------------+----------+------+------+------+----------+
|stock_symbol|prev_close| open| high| low| date|
| FT| 255.01|828.04|361.58|841.53|2022-01-01|
| COHR| 445.76|813.11|937.33|810.12|2022-01-01|
| CHH| 428.5|928.85|929.36|215.53|2022-01-01|
| GD| 439.49|574.96|976.55|772.67|2022-01-01|
| PSAT| 276.16|681.53|328.67|872.12|2022-01-01|
+------------+----------+------+------+------+----------+
>>> stockscsv.schema
StructType(List(StructField(_c0,StringType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true),StructField(_c4,StringType,true),StructField(_c5,StringType,true)))
>>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv', inferSchema=True, header=True)
>>> stockscsv.schema
StructType(List(StructField(stock_symbol,StringType,true),StructField(prev_close,DoubleType,true),StructField(open,DoubleType,true),StructField(high,DoubleType,true),StructField(low,DoubleType,true),StructField(date,TimestampType,true)))
>>> stockscsv = spark.read.csv('file:///home/mkm/data/stocks_data.csv', inferSchema=False, header=True)
>>> stockscsv.schema
StructType(List(StructField(stock_symbol,StringType,true),StructField(prev_close,StringType,true),StructField(open,StringType,true),StructField(high,StringType,true),StructField(low,StringType,true),StructField(date,StringType,true)))
Make sure mysql connector jar on classpath or in SPARK_HOME/lib directory
>>> sampledb_df = spark.read.jdbc('jdbc:mysql://localhost:3306/sampledb','sampletb',properties={'user':'root','password':'root'})
>>> sampledb_df.show()
+------+
|number|
+------+
| 1|
| 2|
| 3|
| 4|
| 5|
+------+
>>> empsaldata=[(123,3434,'USD'),(456,43000,'INR'),(789,42353,'EUR'),(637,6500,'SGD'),(345,6000,'AED')]
>>> empsaldf = spark.createDataFrame(empsaldata)
>>> empsaldf.show()
+---+-----+---+
| _1| _2| _3|
+---+-----+---+
|123| 3434|USD|
|456|43000|INR|
|789|42353|EUR|
|637| 6500|SGD|
|345| 6000|AED|
+---+-----+---+
>>> empsaldf = spark.createDataFrame(empsaldata,['id','salary','currency'])
>>> empsaldf.show()
+---+------+--------+
| id|salary|currency|
+---+------+--------+
|123| 3434| USD|
|456| 43000| INR|
|789| 42353| EUR|
|637| 6500| SGD|
|345| 6000| AED|
+---+------+--------+
>>> from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
>>> empsalschema = StructType([StructField('id',IntegerType(),False),
... StructField('salary',LongType(),False),
... StructField('currency',StringType(),False)])
>>> empsaldf = spark.createDataFrame(empsaldata,empsalschema)
>>> empsaldf.show()
+---+------+--------+
| id|salary|currency|
+---+------+--------+
|123| 3434| USD|
|456| 43000| INR|
|789| 42353| EUR|
|637| 6500| SGD|
|345| 6000| AED|
+---+------+--------+
>>> empsaldf.schema
StructType(List(StructField(id,IntegerType,false),StructField(salary,LongType,false),StructField(currency,StringType,false)))
>>> empsaldf.col
empsaldf.colRegex( empsaldf.collect( empsaldf.columns
>>> empsaldf.columns
['id', 'salary', 'currency']
>>> for col in empsaldf.columns:
... print(col)
...
id
salary
currency
>>> empsalnewcols = [('es_'+col) for col in empsaldf.columns]
>>> empsalnewcols
['es_id', 'es_salary', 'es_currency']
>>> empsaldf.schema
StructType(List(StructField(id,IntegerType,false),StructField(salary,LongType,false),StructField(currency,StringType,false)))
>>> empsaloldcols = empsaldf.columns
>>> empsaloldcols
['id', 'salary', 'currency']
>>> empsalnewcols
['es_id', 'es_salary', 'es_currency']
>>> for i in range(len(empsaloldcols)):
... print(empsaloldcols[i])
...
id
salary
currency
>>> for i in range(len(empsaloldcols)):
... empsaldf = empsaldf.withColumnRenamed(empsaloldcols[i],empsalnewcols[i])
...
>>> empsaldf.schema
StructType(List(StructField(es_id,IntegerType,false),StructField(es_salary,LongType,false),StructField(es_currency,StringType,false)))
|
>>> emps = sc.parallelize([(1,'Avc Ser',2300,'L1'),(2,'Kghf Jfjs',3400,'L1'),(3,'Jgf Fjf',2455,'L2'),(4,'Ejh Ruyg',3500,'L3'),(5,'Efgw Ygefrf',5400,'L4'),(6,'Fuyg Ruyoi',4020,'L4'),(7,"Gfgd Tefdf",3500,'L4')])
>>> empdf = spark.createDataFrame(emps)
>>> empdf.show()
+---+-----------+----+---+
| _1| _2| _3| _4|
+---+-----------+----+---+
| 1| Avc Ser|2300| L1|
| 2| Kghf Jfjs|3400| L1|
| 3| Jgf Fjf|2455| L2|
| 4| Ejh Ruyg|3500| L3|
| 5|Efgw Ygefrf|5400| L4|
| 6| Fuyg Ruyoi|4020| L4|
| 7| Gfgd Tefdf|3500| L4|
+---+-----------+----+---+
>>> empdf = spark.createDataFrame(emps,['empid','name','salary','location'])
>>> empdf.show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 2| Kghf Jfjs| 3400| L1|
| 3| Jgf Fjf| 2455| L2|
| 4| Ejh Ruyg| 3500| L3|
| 5|Efgw Ygefrf| 5400| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
+-----+-----------+------+--------+
>>> empdf.schema
StructType(List(StructField(empid,LongType,true),StructField(name,StringType,true),StructField(salary,LongType,true),StructField(location,StringType,true)))
>>> locs = sc.parallelize([('L1','Accounts','New York'),('L2','Sales','London'),('L3','Marketing','Dubai'),('L4','RND','Mumbai')])
>>> locdf = spark.createDataFrame(locs,["location_id","department","city"])
>>> locdf.show()
+-----------+----------+--------+
|location_id|department| city|
+-----------+----------+--------+
| L1| Accounts|New York|
| L2| Sales| London|
| L3| Marketing| Dubai|
| L4| RND| Mumbai|
+-----------+----------+--------+
>>> empdf.groupby('location').count().show()
+--------+-----+
|location|count|
+--------+-----+
| L1| 2|
| L4| 3|
| L3| 1|
| L2| 1|
+--------+-----+
>>> empdf.join(locdf,empdf.location==locdf.location_id).show()
+-----+-----------+------+--------+-----------+----------+--------+
|empid| name|salary|location|location_id|department| city|
+-----+-----------+------+--------+-----------+----------+--------+
| 1| Avc Ser| 2300| L1| L1| Accounts|New York|
| 2| Kghf Jfjs| 3400| L1| L1| Accounts|New York|
| 5|Efgw Ygefrf| 5400| L4| L4| RND| Mumbai|
| 6| Fuyg Ruyoi| 4020| L4| L4| RND| Mumbai|
| 7| Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| 4| Ejh Ruyg| 3500| L3| L3| Marketing| Dubai|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
+-----+-----------+------+--------+-----------+----------+--------+
>>> empdf.sort(F.col('name')).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.sort('name').show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.sort('name',ascending=True).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.sort('name',ascending=False).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 2| Kghf Jfjs| 3400| L1|
| 3| Jgf Fjf| 2455| L2|
| 7| Gfgd Tefdf| 3500| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 4| Ejh Ruyg| 3500| L3|
| 5|Efgw Ygefrf| 5400| L4|
| 1| Avc Ser| 2300| L1|
+-----+-----------+------+--------+
>>> empdf.sort('salary',ascending=False).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 5|Efgw Ygefrf| 5400| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 4| Ejh Ruyg| 3500| L3|
| 7| Gfgd Tefdf| 3500| L4|
| 2| Kghf Jfjs| 3400| L1|
| 3| Jgf Fjf| 2455| L2|
| 1| Avc Ser| 2300| L1|
+-----+-----------+------+--------+
>>> empdf.sort('salary',ascending=True).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
| 7| Gfgd Tefdf| 3500| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 5|Efgw Ygefrf| 5400| L4|
+-----+-----------+------+--------+
>>> empdf.groupBy('location').count().select(F.col('location'),F.col('count').alias('location_count')).show()
+--------+--------------+
|location|location_count|
+--------+--------------+
| L1| 2|
| L4| 3|
| L3| 1|
| L2| 1|
+--------+--------------+
>>> empdf.groupBy('location').count().alias('location_count').explain()
== Physical Plan ==
*(2) HashAggregate(keys=[location#24], functions=[count(1)])
+- Exchange hashpartitioning(location#24, 200)
+- *(1) HashAggregate(keys=[location#24], functions=[partial_count(1)])
+- *(1) Project [location#24]
+- Scan ExistingRDD[empid#21L,name#22,salary#23L,location#24]
>>> empdf.orderBy("name").show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.orderBy("name").show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
| 3| Jgf Fjf| 2455| L2|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.orderBy(F.desc("name")).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 2| Kghf Jfjs| 3400| L1|
| 3| Jgf Fjf| 2455| L2|
| 7| Gfgd Tefdf| 3500| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 4| Ejh Ruyg| 3500| L3|
| 5|Efgw Ygefrf| 5400| L4|
| 1| Avc Ser| 2300| L1|
+-----+-----------+------+--------+
>>> empdf.orderBy("location",F.desc("salary")).show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 2| Kghf Jfjs| 3400| L1|
| 1| Avc Ser| 2300| L1|
| 3| Jgf Fjf| 2455| L2|
| 4| Ejh Ruyg| 3500| L3|
| 5|Efgw Ygefrf| 5400| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 7| Gfgd Tefdf| 3500| L4|
+-----+-----------+------+--------+
>>> empdf.orderBy(F.desc("location"),"salary").show()
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 7| Gfgd Tefdf| 3500| L4|
| 6| Fuyg Ruyoi| 4020| L4|
| 5|Efgw Ygefrf| 5400| L4|
| 4| Ejh Ruyg| 3500| L3|
| 3| Jgf Fjf| 2455| L2|
| 1| Avc Ser| 2300| L1|
| 2| Kghf Jfjs| 3400| L1|
+-----+-----------+------+--------+
>>> empdf.select("empid","location").show()
+-----+--------+
|empid|location|
+-----+--------+
| 1| L1|
| 2| L1|
| 3| L2|
| 4| L3|
| 5| L4|
| 6| L4|
| 7| L4|
+-----+--------+
>>> empdf.select("empid","location").where("salary > 3000").show()
+-----+--------+
|empid|location|
+-----+--------+
| 2| L1|
| 4| L3|
| 5| L4|
| 6| L4|
| 7| L4|
+-----+--------+
>>> empdf.selectExpr("salary > 4000").show()
+---------------+
|(salary > 4000)|
+---------------+
| false|
| false|
| false|
| false|
| true|
| true|
| false|
+---------------+
>>> empdf.selectExpr("empid","salary > 4000").show()
+-----+---------------+
|empid|(salary > 4000)|
+-----+---------------+
| 1| false|
| 2| false|
| 3| false|
| 4| false|
| 5| true|
| 6| true|
| 7| false|
+-----+---------------+
>> empdf.withColumn("high_salary", F.when(F.col('salary') > 4000,'yes').otherwise('No')).show()
+-----+-----------+------+--------+-----------+
|empid| name|salary|location|high_salary|
+-----+-----------+------+--------+-----------+
| 1| Avc Ser| 2300| L1| No|
| 2| Kghf Jfjs| 3400| L1| No|
| 3| Jgf Fjf| 2455| L2| No|
| 4| Ejh Ruyg| 3500| L3| No|
| 5|Efgw Ygefrf| 5400| L4| yes|
| 6| Fuyg Ruyoi| 4020| L4| yes|
| 7| Gfgd Tefdf| 3500| L4| No|
+-----+-----------+------+--------+-----------+
>>> empdf = empdf.withColumn('high_salary', F.when(F.col('salary') > 4000,'yes').otherwise('No')).withColumn('hq_high_salary',F.when((F.col('location') == 'L1') & (F.col('salary') > 3000),'yes').otherwise('no'))
+-----+-----------+------+--------+-----------+--------------+
|empid| name|salary|location|high_salary|hq_high_salary|
+-----+-----------+------+--------+-----------+--------------+
| 1| Avc Ser| 2300| L1| No| no|
| 2| Kghf Jfjs| 3400| L1| No| yes|
| 3| Jgf Fjf| 2455| L2| No| no|
| 4| Ejh Ruyg| 3500| L3| No| no|
| 5|Efgw Ygefrf| 5400| L4| yes| no|
| 6| Fuyg Ruyoi| 4020| L4| yes| no|
| 7| Gfgd Tefdf| 3500| L4| No| no|
+-----+-----------+------+--------+-----------+--------------+
>>> empdf.withColumnRenamed('hq_high_salary','head_quarters_high_sal').show()
+-----+-----------+------+--------+-----------+---------------------+
|empid| name|salary|location|high_salary|head_quarters_high_sal|
+-----+-----------+------+--------+-----------+---------------------+
| 1| Avc Ser| 2300| L1| No| no|
| 2| Kghf Jfjs| 3400| L1| No| yes|
| 3| Jgf Fjf| 2455| L2| No| no|
| 4| Ejh Ruyg| 3500| L3| No| no|
| 5|Efgw Ygefrf| 5400| L4| yes| no|
| 6| Fuyg Ruyoi| 4020| L4| yes| no|
| 7| Gfgd Tefdf| 3500| L4| No| no|
+-----+-----------+------+--------+-----------+---------------------+
|
>>> emps = sc.parallelize([(1,'Avc Ser',2300,'L1'),(2,'Kghf Jfjs',3400,'L2'),(3,'Jgf Fjf',2455,'L2'),(4,'Ejh Ruyg',3500,''),(5,'Efgw Ygefrf',5400,''),(6,'Fuyg Ruyoi',4020,'L5'),(7,"Gfgd Tefdf",3500,'L4')])
>>> empdf = spark.createDataFrame(emps,['empid','name','salary','location'])
>>> empdf.show() total rows - 7
+-----+-----------+------+--------+
|empid| name|salary|location|
+-----+-----------+------+--------+
| 1| Avc Ser| 2300| L1|
| 2| Kghf Jfjs| 3400| L2|
| 3| Jgf Fjf| 2455| L2|
| 4| Ejh Ruyg| 3500| |
| 5|Efgw Ygefrf| 5400| |
| 6| Fuyg Ruyoi| 4020| L5|
| 7| Gfgd Tefdf| 3500| L4|
+-----+-----------+------+--------+
>>> locs = sc.parallelize([('L1','Accounts','New York'),('L2','Sales','London'),('L3','Marketing','Dubai'),('L4','RND','Mumbai'),('L5','Finance','Singapore'),('L6','Manufacturing','Beijing')])
>>> locdf = spark.createDataFrame(locs,["location_id","department","city"])
>>> locdf.show() total rows - 6
+-----------+-------------+---------+
|location_id| department| city|
+-----------+-------------+---------+
| L1| Accounts| New York|
| L2| Sales| London|
| L3| Marketing| Dubai|
| L4| RND| Mumbai|
| L5| Finance|Singapore|
| L6|Manufacturing| Beijing|
+-----------+-------------+---------+
>>> empdf.crossJoin(locdf).count()
42
>>> empdf.crossJoin(locdf).show(empdf.count() * locdf.count())
+-----+-----------+------+--------+-----------+-------------+---------+
|empid| name|salary|location|location_id| department| city|
+-----+-----------+------+--------+-----------+-------------+---------+
| 1| Avc Ser| 2300| L1| L1| Accounts| New York|
| 1| Avc Ser| 2300| L1| L2| Sales| London|
| 1| Avc Ser| 2300| L1| L3| Marketing| Dubai|
| 1| Avc Ser| 2300| L1| L4| RND| Mumbai|
| 1| Avc Ser| 2300| L1| L5| Finance|Singapore|
| 1| Avc Ser| 2300| L1| L6|Manufacturing| Beijing|
| 2| Kghf Jfjs| 3400| L2| L1| Accounts| New York|
| 2| Kghf Jfjs| 3400| L2| L2| Sales| London|
| 2| Kghf Jfjs| 3400| L2| L3| Marketing| Dubai|
| 2| Kghf Jfjs| 3400| L2| L4| RND| Mumbai|
| 2| Kghf Jfjs| 3400| L2| L5| Finance|Singapore|
| 2| Kghf Jfjs| 3400| L2| L6|Manufacturing| Beijing|
| 3| Jgf Fjf| 2455| L2| L1| Accounts| New York|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
| 3| Jgf Fjf| 2455| L2| L3| Marketing| Dubai|
| 3| Jgf Fjf| 2455| L2| L4| RND| Mumbai|
| 3| Jgf Fjf| 2455| L2| L5| Finance|Singapore|
| 3| Jgf Fjf| 2455| L2| L6|Manufacturing| Beijing|
| 4| Ejh Ruyg| 3500| | L1| Accounts| New York|
| 4| Ejh Ruyg| 3500| | L2| Sales| London|
| 4| Ejh Ruyg| 3500| | L3| Marketing| Dubai|
| 4| Ejh Ruyg| 3500| | L4| RND| Mumbai|
| 4| Ejh Ruyg| 3500| | L5| Finance|Singapore|
| 4| Ejh Ruyg| 3500| | L6|Manufacturing| Beijing|
| 5|Efgw Ygefrf| 5400| | L1| Accounts| New York|
| 5|Efgw Ygefrf| 5400| | L2| Sales| London|
| 5|Efgw Ygefrf| 5400| | L3| Marketing| Dubai|
| 5|Efgw Ygefrf| 5400| | L4| RND| Mumbai|
| 5|Efgw Ygefrf| 5400| | L5| Finance|Singapore|
| 5|Efgw Ygefrf| 5400| | L6|Manufacturing| Beijing|
| 6| Fuyg Ruyoi| 4020| L5| L1| Accounts| New York|
| 6| Fuyg Ruyoi| 4020| L5| L2| Sales| London|
| 6| Fuyg Ruyoi| 4020| L5| L3| Marketing| Dubai|
| 6| Fuyg Ruyoi| 4020| L5| L4| RND| Mumbai|
| 6| Fuyg Ruyoi| 4020| L5| L5| Finance|Singapore|
| 6| Fuyg Ruyoi| 4020| L5| L6|Manufacturing| Beijing|
| 7| Gfgd Tefdf| 3500| L4| L1| Accounts| New York|
| 7| Gfgd Tefdf| 3500| L4| L2| Sales| London|
| 7| Gfgd Tefdf| 3500| L4| L3| Marketing| Dubai|
| 7| Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| 7| Gfgd Tefdf| 3500| L4| L5| Finance|Singapore|
| 7| Gfgd Tefdf| 3500| L4| L6|Manufacturing| Beijing|
+-----+-----------+------+--------+-----------+-------------+---------+
>>> empdf.join(locdf,empdf.location == locdf.location_id, "inner").show()
+-----+----------+------+--------+-----------+----------+---------+
|empid| name|salary|location|location_id|department| city|
+-----+----------+------+--------+-----------+----------+---------+
| 1| Avc Ser| 2300| L1| L1| Accounts| New York|
| 7|Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| 2| Kghf Jfjs| 3400| L2| L2| Sales| London|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
| 6|Fuyg Ruyoi| 4020| L5| L5| Finance|Singapore|
+-----+----------+------+--------+-----------+----------+---------+
left or leftouter
>>> empdf.join(locdf,empdf.location == locdf.location_id, "left").show()
+-----+-----------+------+--------+-----------+----------+---------+
|empid| name|salary|location|location_id|department| city|
+-----+-----------+------+--------+-----------+----------+---------+
| 4| Ejh Ruyg| 3500| | null| null| null|
| 5|Efgw Ygefrf| 5400| | null| null| null|
| 1| Avc Ser| 2300| L1| L1| Accounts| New York|
| 7| Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| 2| Kghf Jfjs| 3400| L2| L2| Sales| London|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
| 6| Fuyg Ruyoi| 4020| L5| L5| Finance|Singapore|
+-----+-----------+------+--------+-----------+----------+---------+
right or rightouter
>>> empdf.join(locdf,empdf.location == locdf.location_id, "right").show()
+-----+----------+------+--------+-----------+-------------+---------+
|empid| name|salary|location|location_id| department| city|
+-----+----------+------+--------+-----------+-------------+---------+
| null| null| null| null| L6|Manufacturing| Beijing|
| 1| Avc Ser| 2300| L1| L1| Accounts| New York|
| 7|Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| null| null| null| null| L3| Marketing| Dubai|
| 2| Kghf Jfjs| 3400| L2| L2| Sales| London|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
| 6|Fuyg Ruyoi| 4020| L5| L5| Finance|Singapore|
+-----+----------+------+--------+-----------+-------------+---------+
full or fullouter or outer or full_outer
>>> empdf.join(locdf,empdf.location == locdf.location_id, "full").show()
+-----+-----------+------+--------+-----------+-------------+---------+
|empid| name|salary|location|location_id| department| city|
+-----+-----------+------+--------+-----------+-------------+---------+
| null| null| null| null| L6|Manufacturing| Beijing|
| 4| Ejh Ruyg| 3500| | null| null| null|
| 5|Efgw Ygefrf| 5400| | null| null| null|
| 1| Avc Ser| 2300| L1| L1| Accounts| New York|
| 7| Gfgd Tefdf| 3500| L4| L4| RND| Mumbai|
| null| null| null| null| L3| Marketing| Dubai|
| 2| Kghf Jfjs| 3400| L2| L2| Sales| London|
| 3| Jgf Fjf| 2455| L2| L2| Sales| London|
| 6| Fuyg Ruyoi| 4020| L5| L5| Finance|Singapore|
+-----+-----------+------+--------+-----------+-------------+---------+
|
|
|
Comments
Post a Comment