hive-complex-data-types

Hive Complex Data types
Hive Complex Data types

Complex Data types supported in Hive are
1. Array
2. Map
3. Struct
4. Nested Type


Datastructure which holds ordered homogeneous elements separated by commas

hive> create table offices_v1(office_id int, location string, departments array<string>)
row format delimited
fields terminated by ','
collection items terminated by '|'
lines terminated by '\n'
stored as textfile;
OK
Time taken: 0.958 seconds

data.csv
1,New York,HR|Sales
2,Sydney,Development|Research
3,London,Finance|Training|Sales
4,Dubai,Accounts|Sales|Research
5,Mumbai,Development|Support|Sales
6,Singapore,Development|Sales
7,Sydney,Sales|Marketing
8,Beijing,Research
9,Tokyo,Research|Sales|Development

hive> load data local inpath '/home/mkm/mm/data/office_ver1.csv' into table offices_v1;
Loading data to table hive_complex_types_db.offices_v1
OK
Time taken: 1.177 seconds
hive> select * from offices_v1;
OK
offices_v1.office_id	offices_v1.location	offices_v1.departments
1	New York	["HR","Sales"]
2	Sydney	["Development","Research"]
3	London	["Finance","Training","Sales"]
4	Dubai	["Accounts","Sales","Research"]
5	Mumbai	["Development","Support","Sales"]
6	Singapore	["Development","Sales"]
7	Sydney	["Sales","Marketing"]
8	Beijing	["Research"]
9	Tokyo	["Research","Sales","Development"]
Time taken: 0.453 seconds, Fetched: 9 row(s)

hive> select *, size(departments) as total_depts from offices_v1;
OK
offices_v1.office_id	offices_v1.location	offices_v1.departments	total_depts
1	New York	["HR","Sales"]	2
2	Sydney	["Development","Research"]	2
3	London	["Finance","Training","Sales"]	3
4	Dubai	["Accounts","Sales","Research"]	3
5	Mumbai	["Development","Support","Sales"]	3
6	Singapore	["Development","Sales"]	2
7	Sydney	["Sales","Marketing"]	2
8	Beijing	["Research"]	1
9	Tokyo	["Research","Sales","Development"]	3

hive> select * from offices_v1 where array_contains(departments,'Sales');
OK
1	New York	["HR","Sales"]
3	London	["Finance","Training","Sales"]
4	Dubai	["Accounts","Sales","Research"]
5	Mumbai	["Development","Support","Sales"]
6	Singapore	["Development","Sales"]
7	Sydney	["Sales","Marketing"]
9	Tokyo	["Research","Sales","Development"]
Time taken: 1.025 seconds, Fetched: 7 row(s)

hive> select explode(departments)  from offices_v1 where array_contains(departments,'Training');
OK
col
Finance
Training
Sales

hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept;
OK
office_id	location	dept
1	New York	HR
1	New York	Sales
2	Sydney	Development
2	Sydney	Research
3	London	Finance
3	London	Training
3	London	Sales
4	Dubai	Accounts
4	Dubai	Sales
4	Dubai	Research
5	Mumbai	Development
5	Mumbai	Support
5	Mumbai	Sales
6	Singapore	Development
6	Singapore	Sales
7	Sydney	Sales
7	Sydney	Marketing
8	Beijing	Research
9	Tokyo	Research
9	Tokyo	Sales
9	Tokyo	Development
Time taken: 0.178 seconds, Fetched: 21 row(s)

hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept where dept == 'Sales';
OK
office_id	location	dept
1	New York	Sales
3	London	Sales
4	Dubai	Sales
5	Mumbai	Sales
6	Singapore	Sales
7	Sydney	Sales
9	Tokyo	Sales
Time taken: 0.282 seconds, Fetched: 7 row(s)



Datastructure which holds unordered key-value pairs
Keys should be the primitive types

hive> create table offices_v2(office_id int, address string, location map<string,string>)
row format delimited
fields terminated by ","
collection items terminated by "|"
map keys terminated by "="
lines terminated by "\n"
stored as textfile;
OK
Time taken: 0.106 seconds


offices2.csv
1,New York,long=40.71|latd=-74.00
2,London,long=-0.127758|latd=51.507351
3,New Delhi,long=77.209023|latd=28.613939
4,Tokyo,lond=139.691711|latd=35.689487

hive> load data local inpath '/home/mkm/mm/data/offices_ver2.csv' into table offices_v2;
Loading data to table hive_complex_types_db.offices_v2
OK
Time taken: 1.717 seconds

hive> select * from offices_v2;
OK
offices_v2.office_id	offices_v2.address	offices_v2.location
1	New York	{"long":"40.71","latd":"-74.00"}
2	London	{"long":"-0.127758","latd":"51.507351"}
3	New Delhi	{"long":"77.209023","latd":"28.613939"}
4	Tokyo	{"long":"139.691711","latd":"35.689487"}

hive> select location['long'] as longitude,location['latd'] as latitude from offices_v2 where address='New York';
OK
longitude	latitude
40.71	-74.00
Time taken: 0.683 seconds, Fetched: 1 row(s)

hive> select office_id,address, location['long'] as longitude,location['latd'] as latitude from offices_v2;;
OK
office_id	address	longitude	latitude
1	New York	40.71	-74.00
2	London	-0.127758	51.507351
3	New Delhi	77.209023	28.613939
4	Tokyo	139.691711	35.689487
Time taken: 0.352 seconds, Fetched: 4 row(s)

hive> select map_keys(location) from offices_v2;
OK
_c0
["long","latd"]
["long","latd"]
["long","latd"]
["long","latd"]
Time taken: 0.481 seconds, Fetched: 4 row(s)
hive> select map_values(location) from offices_v2;
OK
_c0
["40.71","-74.00"]
["-0.127758","51.507351"]
["77.209023","28.613939"]
["139.691711","35.689487"]



complex datastructure, It is a collection of different data type variables

hive> create table offices_v3(office_id int,location string,contacts struct<contact:string, contact2:string>)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '='
lines terminated by '\n'
stored as textfile;

offices4.csv

1,New York,dff|fgfg
2,Tokyo,fgbfg|zsdlkg
3,Sydney,ksdhf|lsfud

hive> select inline(array(struct('a',10),struct('b',20),struct('c',23)));
OK
a	10
b	20
c	23
Time taken: 0.342 seconds, Fetched: 3 row(s)

hive> select t.*,tf.* from (select 0) as t lateral view inline(array(struct('a',10),struct('b',20),struct('c',23))) tf;
OK
0	a	10
0	b	20
0	c	23
Time taken: 0.595 seconds, Fetched: 3 row(s)



complex datastructure, It is a collection of different data types nested or inner variables

hive> create table offices(office_id int,
office_name string,
departments array<string>,
contacts map<string,string>,
communication struct<call:string, fax:array<string>, support: map<string,string>>)
row format delimited
stored as textfile;

Inserting into Structs/Complex data structures/Nested columns
hive>insert into offices values(1,'GURGAON_ISPS3',array('HR','SALES'), map('primary','San Drti','secondary','Mui Fedur'), named_struct('call','84574857','fax',array('3473647','534353','54445'),'support',map('technical','343434343','general','3434343434')));

hive>insert into offices values(2,'HYDERABAD_LTPS1',array('RND','ACCOUNTS'), map('primary','Din Frueh','secondary','Ghah Murf'), named_struct('call','04084574857','fax',array('0403473647','040534353','04054445'),'support',map('technical','43431212','general','89998124')));

hive> select * from offices;
OK
1       GURGAON_ISPS3   ["HR","SALES"]  {"primary":"San Drti","secondary":"Mui Fedur"}  {"call":"84574857","fax":["3473647","534353","54445"],"support":{"technical":"343434343","general":"3434343434"}}
2	HYDERABAD_LTPS1	["RND","ACCOUNTS"]	{"primary":"Din Frueh","secondary":"Ghah Murf"}	{"call":"04084574857","fax":["0403473647","040534353","04054445"],"support":{"technical":"43431212","general":"89998124"}}
Time taken: 0.384 seconds, Fetched: 2 row(s)

hive> desc offices;
OK
office_id           	int
office_name         	string
departments         	array<string>
contacts            	map<string,string>
communication       	struct<call:string,fax:array<string>,support:map<string,string>>
Time taken: 0.088 seconds, Fetched: 5 row(s)

hive> select office_name, departments[0] as primary_dept from offices;
OK
GURGAON_ISPS3	HR
HYDERABAD_LTPS1	RND
Time taken: 0.395 seconds, Fetched: 2 row(s)

hive> select office_name, departments[0] as primary_dept, contacts['primary'] as primary_contact from offices;
OK
office_name	primary_dept	primary_contact
GURGAON_ISPS3	HR	San Drti
HYDERABAD_LTPS1	RND	Din Frueh

hive> select office_name, communication.call as office_contact, communication.fax[0] as primary_fax, communication.support['general'] as general_support  from offices;
OK
office_name	office_contact	primary_fax	general_support
GURGAON_ISPS3	84574857	3473647	3434343434
HYDERABAD_LTPS1	04084574857	0403473647	89998124
Time taken: 0.281 seconds, Fetched: 2 row(s)

Compiled on WEDNESDAY, 31-JULY-2024, 08:59:56 PM IST

Comments

Popular posts from this blog

hadoop-installation-ubuntu

jenv-tool

hive-installation-in-ubuntu