|
|
Complex Data types supported in Hive are 1. Array 2. Map 3. Struct 4. Nested Type
|
Datastructure which holds ordered homogeneous elements separated by commas
hive> create table offices_v1(office_id int, location string, departments array<string>)
row format delimited
fields terminated by ','
collection items terminated by '|'
lines terminated by '\n'
stored as textfile;
OK
Time taken: 0.958 seconds
data.csv
1,New York,HR|Sales
2,Sydney,Development|Research
3,London,Finance|Training|Sales
4,Dubai,Accounts|Sales|Research
5,Mumbai,Development|Support|Sales
6,Singapore,Development|Sales
7,Sydney,Sales|Marketing
8,Beijing,Research
9,Tokyo,Research|Sales|Development
hive> load data local inpath '/home/mkm/mm/data/office_ver1.csv' into table offices_v1;
Loading data to table hive_complex_types_db.offices_v1
OK
Time taken: 1.177 seconds
hive> select * from offices_v1;
OK
offices_v1.office_id offices_v1.location offices_v1.departments
1 New York ["HR","Sales"]
2 Sydney ["Development","Research"]
3 London ["Finance","Training","Sales"]
4 Dubai ["Accounts","Sales","Research"]
5 Mumbai ["Development","Support","Sales"]
6 Singapore ["Development","Sales"]
7 Sydney ["Sales","Marketing"]
8 Beijing ["Research"]
9 Tokyo ["Research","Sales","Development"]
Time taken: 0.453 seconds, Fetched: 9 row(s)
hive> select *, size(departments) as total_depts from offices_v1;
OK
offices_v1.office_id offices_v1.location offices_v1.departments total_depts
1 New York ["HR","Sales"] 2
2 Sydney ["Development","Research"] 2
3 London ["Finance","Training","Sales"] 3
4 Dubai ["Accounts","Sales","Research"] 3
5 Mumbai ["Development","Support","Sales"] 3
6 Singapore ["Development","Sales"] 2
7 Sydney ["Sales","Marketing"] 2
8 Beijing ["Research"] 1
9 Tokyo ["Research","Sales","Development"] 3
hive> select * from offices_v1 where array_contains(departments,'Sales');
OK
1 New York ["HR","Sales"]
3 London ["Finance","Training","Sales"]
4 Dubai ["Accounts","Sales","Research"]
5 Mumbai ["Development","Support","Sales"]
6 Singapore ["Development","Sales"]
7 Sydney ["Sales","Marketing"]
9 Tokyo ["Research","Sales","Development"]
Time taken: 1.025 seconds, Fetched: 7 row(s)
hive> select explode(departments) from offices_v1 where array_contains(departments,'Training');
OK
col
Finance
Training
Sales
hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept;
OK
office_id location dept
1 New York HR
1 New York Sales
2 Sydney Development
2 Sydney Research
3 London Finance
3 London Training
3 London Sales
4 Dubai Accounts
4 Dubai Sales
4 Dubai Research
5 Mumbai Development
5 Mumbai Support
5 Mumbai Sales
6 Singapore Development
6 Singapore Sales
7 Sydney Sales
7 Sydney Marketing
8 Beijing Research
9 Tokyo Research
9 Tokyo Sales
9 Tokyo Development
Time taken: 0.178 seconds, Fetched: 21 row(s)
hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept where dept == 'Sales';
OK
office_id location dept
1 New York Sales
3 London Sales
4 Dubai Sales
5 Mumbai Sales
6 Singapore Sales
7 Sydney Sales
9 Tokyo Sales
Time taken: 0.282 seconds, Fetched: 7 row(s)
|
Datastructure which holds unordered key-value pairs
Keys should be the primitive types
hive> create table offices_v2(office_id int, address string, location map<string,string>)
row format delimited
fields terminated by ","
collection items terminated by "|"
map keys terminated by "="
lines terminated by "\n"
stored as textfile;
OK
Time taken: 0.106 seconds
offices2.csv
1,New York,long=40.71|latd=-74.00
2,London,long=-0.127758|latd=51.507351
3,New Delhi,long=77.209023|latd=28.613939
4,Tokyo,lond=139.691711|latd=35.689487
hive> load data local inpath '/home/mkm/mm/data/offices_ver2.csv' into table offices_v2;
Loading data to table hive_complex_types_db.offices_v2
OK
Time taken: 1.717 seconds
hive> select * from offices_v2;
OK
offices_v2.office_id offices_v2.address offices_v2.location
1 New York {"long":"40.71","latd":"-74.00"}
2 London {"long":"-0.127758","latd":"51.507351"}
3 New Delhi {"long":"77.209023","latd":"28.613939"}
4 Tokyo {"long":"139.691711","latd":"35.689487"}
hive> select location['long'] as longitude,location['latd'] as latitude from offices_v2 where address='New York';
OK
longitude latitude
40.71 -74.00
Time taken: 0.683 seconds, Fetched: 1 row(s)
hive> select office_id,address, location['long'] as longitude,location['latd'] as latitude from offices_v2;;
OK
office_id address longitude latitude
1 New York 40.71 -74.00
2 London -0.127758 51.507351
3 New Delhi 77.209023 28.613939
4 Tokyo 139.691711 35.689487
Time taken: 0.352 seconds, Fetched: 4 row(s)
hive> select map_keys(location) from offices_v2;
OK
_c0
["long","latd"]
["long","latd"]
["long","latd"]
["long","latd"]
Time taken: 0.481 seconds, Fetched: 4 row(s)
hive> select map_values(location) from offices_v2;
OK
_c0
["40.71","-74.00"]
["-0.127758","51.507351"]
["77.209023","28.613939"]
["139.691711","35.689487"]
|
complex datastructure, It is a collection of different data type variables
hive> create table offices_v3(office_id int,location string,contacts struct<contact:string, contact2:string>)
row format delimited
fields terminated by ','
collection items terminated by '|'
map keys terminated by '='
lines terminated by '\n'
stored as textfile;
offices4.csv
1,New York,dff|fgfg
2,Tokyo,fgbfg|zsdlkg
3,Sydney,ksdhf|lsfud
hive> select inline(array(struct('a',10),struct('b',20),struct('c',23)));
OK
a 10
b 20
c 23
Time taken: 0.342 seconds, Fetched: 3 row(s)
hive> select t.*,tf.* from (select 0) as t lateral view inline(array(struct('a',10),struct('b',20),struct('c',23))) tf;
OK
0 a 10
0 b 20
0 c 23
Time taken: 0.595 seconds, Fetched: 3 row(s)
|
complex datastructure, It is a collection of different data types nested or inner variables
hive> create table offices(office_id int,
office_name string,
departments array<string>,
contacts map<string,string>,
communication struct<call:string, fax:array<string>, support: map<string,string>>)
row format delimited
stored as textfile;
Inserting into Structs/Complex data structures/Nested columns
hive>insert into offices values(1,'GURGAON_ISPS3',array('HR','SALES'), map('primary','San Drti','secondary','Mui Fedur'), named_struct('call','84574857','fax',array('3473647','534353','54445'),'support',map('technical','343434343','general','3434343434')));
hive>insert into offices values(2,'HYDERABAD_LTPS1',array('RND','ACCOUNTS'), map('primary','Din Frueh','secondary','Ghah Murf'), named_struct('call','04084574857','fax',array('0403473647','040534353','04054445'),'support',map('technical','43431212','general','89998124')));
hive> select * from offices;
OK
1 GURGAON_ISPS3 ["HR","SALES"] {"primary":"San Drti","secondary":"Mui Fedur"} {"call":"84574857","fax":["3473647","534353","54445"],"support":{"technical":"343434343","general":"3434343434"}}
2 HYDERABAD_LTPS1 ["RND","ACCOUNTS"] {"primary":"Din Frueh","secondary":"Ghah Murf"} {"call":"04084574857","fax":["0403473647","040534353","04054445"],"support":{"technical":"43431212","general":"89998124"}}
Time taken: 0.384 seconds, Fetched: 2 row(s)
hive> desc offices;
OK
office_id int
office_name string
departments array<string>
contacts map<string,string>
communication struct<call:string,fax:array<string>,support:map<string,string>>
Time taken: 0.088 seconds, Fetched: 5 row(s)
hive> select office_name, departments[0] as primary_dept from offices;
OK
GURGAON_ISPS3 HR
HYDERABAD_LTPS1 RND
Time taken: 0.395 seconds, Fetched: 2 row(s)
hive> select office_name, departments[0] as primary_dept, contacts['primary'] as primary_contact from offices;
OK
office_name primary_dept primary_contact
GURGAON_ISPS3 HR San Drti
HYDERABAD_LTPS1 RND Din Frueh
hive> select office_name, communication.call as office_contact, communication.fax[0] as primary_fax, communication.support['general'] as general_support from offices;
OK
office_name office_contact primary_fax general_support
GURGAON_ISPS3 84574857 3473647 3434343434
HYDERABAD_LTPS1 04084574857 0403473647 89998124
Time taken: 0.281 seconds, Fetched: 2 row(s)
|
|
|
Comments
Post a Comment