hive-complex-data-types

hive-complex-data-types

Hive Complex Data types

Hive Complex Data types

Complex Data types supported in Hive are 1. Array 2. Map 3. Struct 4. Nested Type
1. Array Datastructure which holds ordered homogeneous elements separated by commas hive> create table offices_v1(office_id int, location string, departments array<string>) row format delimited fields terminated by ',' collection items terminated by '\|' lines terminated by '\n' stored as textfile; OK Time taken: 0.958 seconds data.csv 1,New York,HR\|Sales 2,Sydney,Development\|Research 3,London,Finance\|Training\|Sales 4,Dubai,Accounts\|Sales\|Research 5,Mumbai,Development\|Support\|Sales 6,Singapore,Development\|Sales 7,Sydney,Sales\|Marketing 8,Beijing,Research 9,Tokyo,Research\|Sales\|Development hive> load data local inpath '/home/mkm/mm/data/office_ver1.csv' into table offices_v1; Loading data to table hive_complex_types_db.offices_v1 OK Time taken: 1.177 seconds hive> select * from offices_v1; OK offices_v1.office_id offices_v1.location offices_v1.departments 1 New York ["HR","Sales"] 2 Sydney ["Development","Research"] 3 London ["Finance","Training","Sales"] 4 Dubai ["Accounts","Sales","Research"] 5 Mumbai ["Development","Support","Sales"] 6 Singapore ["Development","Sales"] 7 Sydney ["Sales","Marketing"] 8 Beijing ["Research"] 9 Tokyo ["Research","Sales","Development"] Time taken: 0.453 seconds, Fetched: 9 row(s) hive> select , size(departments) as total_depts from offices_v1; OK offices_v1.office_id offices_v1.location offices_v1.departments total_depts 1 New York ["HR","Sales"] 2 2 Sydney ["Development","Research"] 2 3 London ["Finance","Training","Sales"] 3 4 Dubai ["Accounts","Sales","Research"] 3 5 Mumbai ["Development","Support","Sales"] 3 6 Singapore ["Development","Sales"] 2 7 Sydney ["Sales","Marketing"] 2 8 Beijing ["Research"] 1 9 Tokyo ["Research","Sales","Development"] 3 hive> select from offices_v1 where array_contains(departments,'Sales'); OK 1 New York ["HR","Sales"] 3 London ["Finance","Training","Sales"] 4 Dubai ["Accounts","Sales","Research"] 5 Mumbai ["Development","Support","Sales"] 6 Singapore ["Development","Sales"] 7 Sydney ["Sales","Marketing"] 9 Tokyo ["Research","Sales","Development"] Time taken: 1.025 seconds, Fetched: 7 row(s) hive> select explode(departments) from offices_v1 where array_contains(departments,'Training'); OK col Finance Training Sales hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept; OK office_id location dept 1 New York HR 1 New York Sales 2 Sydney Development 2 Sydney Research 3 London Finance 3 London Training 3 London Sales 4 Dubai Accounts 4 Dubai Sales 4 Dubai Research 5 Mumbai Development 5 Mumbai Support 5 Mumbai Sales 6 Singapore Development 6 Singapore Sales 7 Sydney Sales 7 Sydney Marketing 8 Beijing Research 9 Tokyo Research 9 Tokyo Sales 9 Tokyo Development Time taken: 0.178 seconds, Fetched: 21 row(s) hive> select office_id, location, dept from offices_v1 lateral view explode(departments) f as dept where dept == 'Sales'; OK office_id location dept 1 New York Sales 3 London Sales 4 Dubai Sales 5 Mumbai Sales 6 Singapore Sales 7 Sydney Sales 9 Tokyo Sales Time taken: 0.282 seconds, Fetched: 7 row(s)
2. Map Datastructure which holds unordered key-value pairs Keys should be the primitive types hive> create table offices_v2(office_id int, address string, location map<string,string>) row format delimited fields terminated by "," collection items terminated by "\|" map keys terminated by "=" lines terminated by "\n" stored as textfile; OK Time taken: 0.106 seconds offices2.csv 1,New York,long=40.71\|latd=-74.00 2,London,long=-0.127758\|latd=51.507351 3,New Delhi,long=77.209023\|latd=28.613939 4,Tokyo,lond=139.691711\|latd=35.689487 hive> load data local inpath '/home/mkm/mm/data/offices_ver2.csv' into table offices_v2; Loading data to table hive_complex_types_db.offices_v2 OK Time taken: 1.717 seconds hive> select * from offices_v2; OK offices_v2.office_id offices_v2.address offices_v2.location 1 New York {"long":"40.71","latd":"-74.00"} 2 London {"long":"-0.127758","latd":"51.507351"} 3 New Delhi {"long":"77.209023","latd":"28.613939"} 4 Tokyo {"long":"139.691711","latd":"35.689487"} hive> select location['long'] as longitude,location['latd'] as latitude from offices_v2 where address='New York'; OK longitude latitude 40.71 -74.00 Time taken: 0.683 seconds, Fetched: 1 row(s) hive> select office_id,address, location['long'] as longitude,location['latd'] as latitude from offices_v2;; OK office_id address longitude latitude 1 New York 40.71 -74.00 2 London -0.127758 51.507351 3 New Delhi 77.209023 28.613939 4 Tokyo 139.691711 35.689487 Time taken: 0.352 seconds, Fetched: 4 row(s) hive> select map_keys(location) from offices_v2; OK _c0 ["long","latd"] ["long","latd"] ["long","latd"] ["long","latd"] Time taken: 0.481 seconds, Fetched: 4 row(s) hive> select map_values(location) from offices_v2; OK _c0 ["40.71","-74.00"] ["-0.127758","51.507351"] ["77.209023","28.613939"] ["139.691711","35.689487"]
3. Struct complex datastructure, It is a collection of different data type variables hive> create table offices_v3(office_id int,location string,contacts struct<contact:string, contact2:string>) row format delimited fields terminated by ',' collection items terminated by '\|' map keys terminated by '=' lines terminated by '\n' stored as textfile; offices4.csv 1,New York,dff\|fgfg 2,Tokyo,fgbfg\|zsdlkg 3,Sydney,ksdhf\|lsfud hive> select inline(array(struct('a',10),struct('b',20),struct('c',23))); OK a 10 b 20 c 23 Time taken: 0.342 seconds, Fetched: 3 row(s) hive> select t.,tf. from (select 0) as t lateral view inline(array(struct('a',10),struct('b',20),struct('c',23))) tf; OK 0 a 10 0 b 20 0 c 23 Time taken: 0.595 seconds, Fetched: 3 row(s)
4. Complex Type/Nested Data types complex datastructure, It is a collection of different data types nested or inner variables hive> create table offices(office_id int, office_name string, departments array<string>, contacts map<string,string>, communication struct<call:string, fax:array<string>, support: map<string,string>>) row format delimited stored as textfile; Inserting into Structs/Complex data structures/Nested columns hive>insert into offices values(1,'GURGAON_ISPS3',array('HR','SALES'), map('primary','San Drti','secondary','Mui Fedur'), named_struct('call','84574857','fax',array('3473647','534353','54445'),'support',map('technical','343434343','general','3434343434'))); hive>insert into offices values(2,'HYDERABAD_LTPS1',array('RND','ACCOUNTS'), map('primary','Din Frueh','secondary','Ghah Murf'), named_struct('call','04084574857','fax',array('0403473647','040534353','04054445'),'support',map('technical','43431212','general','89998124'))); hive> select * from offices; OK 1 GURGAON_ISPS3 ["HR","SALES"] {"primary":"San Drti","secondary":"Mui Fedur"} {"call":"84574857","fax":["3473647","534353","54445"],"support":{"technical":"343434343","general":"3434343434"}} 2 HYDERABAD_LTPS1 ["RND","ACCOUNTS"] {"primary":"Din Frueh","secondary":"Ghah Murf"} {"call":"04084574857","fax":["0403473647","040534353","04054445"],"support":{"technical":"43431212","general":"89998124"}} Time taken: 0.384 seconds, Fetched: 2 row(s) hive> desc offices; OK office_id int office_name string departments array<string> contacts map<string,string> communication struct<call:string,fax:array<string>,support:map<string,string>> Time taken: 0.088 seconds, Fetched: 5 row(s) hive> select office_name, departments[0] as primary_dept from offices; OK GURGAON_ISPS3 HR HYDERABAD_LTPS1 RND Time taken: 0.395 seconds, Fetched: 2 row(s) hive> select office_name, departments[0] as primary_dept, contacts['primary'] as primary_contact from offices; OK office_name primary_dept primary_contact GURGAON_ISPS3 HR San Drti HYDERABAD_LTPS1 RND Din Frueh hive> select office_name, communication.call as office_contact, communication.fax[0] as primary_fax, communication.support['general'] as general_support from offices; OK office_name office_contact primary_fax general_support GURGAON_ISPS3 84574857 3473647 3434343434 HYDERABAD_LTPS1 04084574857 0403473647 89998124 Time taken: 0.281 seconds, Fetched: 2 row(s)
Compiled on WEDNESDAY, 31-JULY-2024, 08:59:56 PM IST

Comments