Database Study Notes
-------------------------------- Useful Hive functions ----------------------------
--SUBSTRING_INDEX: splits a string around a delimiter:
SUBSTRING_INDEX(msg_time, ' ', 1) AS msg_date,
SUBSTRING_INDEX(SUBSTRING_INDEX(msg_time, ' ', -1), ':', 1) AS msg_hour,
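-- A quick check with a literal value; the 'yyyy-MM-dd HH:mm:ss' layout of msg_time is an assumption:
SELECT SUBSTRING_INDEX('2021-11-01 14:30:29', ' ', 1);  -- '2021-11-01' (everything before the 1st space)
SELECT SUBSTRING_INDEX(SUBSTRING_INDEX('2021-11-01 14:30:29', ' ', -1), ':', 1);  -- '14' (hour of the time part)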
--CAST: converts a value to another type
--SPLIT: splits a string on a delimiter, returning an array
CAST(SPLIT(receiver_gps, ' ')[0] AS DOUBLE) AS receiver_longitude,
CAST(SPLIT(receiver_gps, ' ')[1] AS DOUBLE) AS receiver_latitude,
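-- Sample rows for the heros table below (index, name, id, then skin:price pairs joined by '-'):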
2,鲁班七号,54,木偶奇遇记:288-福禄兄弟:288-黑桃队长:60-电玩小子:2288-星空梦想:0
3,后裔,53,精灵王:288-阿尔法小队:588-辉光之辰:888-黄金射手座:1688-如梦令:1314
4,铠,52,龙域领主:288-曙光守护者:1776
5,韩信,52,飞衡:1788-逐梦之影:888-白龙吟:1188-教廷特使:0-街头霸王:888
--Create a table with multiple column types
create table heros(
index int,
name string,
id int,
price map<string,int> --key-value map type
)
-- Define the row format, using a comma as the field delimiter
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' -- separates the fields
COLLECTION ITEMS TERMINATED BY '-' -- separates elements of a collection
MAP KEYS TERMINATED BY ':' -- separates a map key from its value
STORED AS TEXTFILE; -- store as a plain text file
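-- With the table created, a map column can be indexed by key. A minimal sketch,
-- assuming the sample rows above were saved to /root/heros.txt (hypothetical path):
LOAD DATA LOCAL INPATH '/root/heros.txt' INTO TABLE heros;
SELECT name, price['白龙吟'] AS skin_price FROM heros WHERE name = '韩信'; -- 1188 in the sample data
-- DISTRIBUTE BY routes rows to reducers by the given key; SORT BY then sorts within each reducer: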
SELECT *
FROM msg_etl
DISTRIBUTE BY sender_sex
SORT BY msg_hour ASC;
--Hive subqueries: FROM subquery
SELECT changed_etl.sender_account, changed_etl.receiver_account FROM (
  SELECT sender_account, receiver_account, msg_hour FROM msg_etl
  DISTRIBUTE BY sender_account
  SORT BY msg_hour
) changed_etl;
--WHERE subqueries: correlated vs. uncorrelated
--Uncorrelated: find messages whose receiver is the first sender returned:
SELECT *
FROM msg_etl
WHERE receiver_account IN (
SELECT sender_account
FROM msg_etl
LIMIT 1
);
--Correlated: the subquery references a column of the outer query:
SELECT DISTINCT m.sender_account
FROM msg_etl m
WHERE EXISTS (
  SELECT 1
  FROM msg_etl r
  WHERE r.receiver_account = m.sender_account
);
---------------------------------------------------
--Use a CTE and count its rows:
WITH cte AS (
  SELECT msg_hour, sender_network
  FROM msg_etl
  WHERE msg_hour > 11 AND sender_network = '4G'
)
SELECT COUNT(*) AS row_count FROM cte;
---------------------------------------------------
--Hive JOIN operations:
-- Inner Join Example
SELECT *
FROM table1
INNER JOIN table2
ON table1.id = table2.id;
-- Left Join Example
SELECT *
FROM table1
LEFT JOIN table2
ON table1.id = table2.id;
show functions;
describe function <function_name>; -- e.g. describe function nvl;
--------------------------------- Hive built-in functions
--IF: two-way conditional
SELECT IF(sender_network = '5G', 1, 0) AS network_type
FROM msg_etl;
--NVL: substitute a default value for NULL
select nvl(NULL, "NULL_warning");
--COALESCE: returns the first non-NULL argument (a multi-argument generalization of NVL)
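-- A quick check (the literal values are only illustrative):
select coalesce(NULL, NULL, "first_non_null"); -- returns first_non_null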
--CASE WHEN: multi-way conditional
SELECT
CASE WHEN sender_sex = 'male' THEN 0
WHEN sender_sex = 'female' THEN 1
ELSE NULL
END AS gender,
COUNT(*) AS count
FROM msg_etl
GROUP BY sender_sex;
--CAST syntax:
cast(expr as type)
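-- e.g., with an illustrative literal:
select cast('3.14' as double); -- 3.14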
-------------------------------------
--HBase shell:
--Key command: getting help
help
help "COMMAND"
ddl: table definitions
namespace: namespace management
dml: data manipulation
--List namespaces:
list_namespace
create_namespace 'ns1',{'PROPERTY_NAME'=>'PROPERTY_VALUE'}
create_namespace 'bigdata'
--DDL: list, create, and alter tables
list
create 'ns1:t1', {NAME => 'info', VERSIONS => 5} --'info' is the column family; 'ns1:' may be omitted (default namespace)
--VERSIONS: number of cell versions kept (default 1)
--Simple form: table name plus column families
create 'student','info','info1','msg'
--Several column families, each with options:
create 'bigdata1:person', {NAME => 'name', VERSIONS => 5}, {NAME => 'card', VERSIONS => 5}
describe 'student';
--Alter a table
help 'alter'
--Change the number of versions kept:
describe 'student'
alter 'student', NAME => 'info', VERSIONS => 5
--Delete
help 'delete'
alter 'student', 'delete' => 'info1' --remove the info1 column family
--Note: when multiple versions are kept, delete (removes one version) differs from deleteall (removes the whole cell/row)
delete 'ns1:t1', 'r1', 'c1', ts1 --ts1 is a timestamp and need not be exact
--Drop a table (disable it first)
disable 'student'
drop 'student'
--Writing data in HBase: put adds or updates (upserts) a cell
create 'bigdata:student', 'info', 'msg'
help 'put'
--Argument order:
put 'namespace:table', 'rowkey', 'column_family:column', 'value'
-------------------------------------------
|             |   column family: info      |
|             |  info:name  |  info:age    |
-------------------------------------------
| rowkey:1004 |    lisi     |     10       |
-------------------------------------------
| ...         |             |              |
-------------------------------------------
put 'bigdata:student', '1001','info:name','zhangsan'
put 'bigdata:student', '1004','info:age','10'
put 'bigdata:student', '1004','info:name','lisi'
-- Reading data
help 'get'
get 'bigdata:student','1004'
get 'bigdata:student','1004',{COLUMN => 'info:name'}
--scan: read multiple rows
scan 't1', {STARTROW => '1001'} --the row range is half-open: start inclusive, stop exclusive
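--Bounding both ends (row keys are illustrative): returns rows from '1001' up to, but excluding, '1004'
scan 'bigdata:student', {STARTROW => '1001', STOPROW => '1004'}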
-----------------------------------------------------
--Spark SQL (spark-shell)
--Read data from a local path
val lines = sc.textFile("file:///root/log1.txt")
lines.count()
lines.collect()
--Read data from HDFS
val lines1 = sc.textFile("hdfs://hacluster/user/stu01/input/cx_input_data1.txt")
val lines2 = sc.textFile("/user/stu01/input/cx_input_data1.txt")
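--A classic word count over the file loaded above; a sketch assuming space-separated tokens:
val wordCounts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
wordCounts.collect()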
--RDD sorting, combined with the map operator
val rdd1 = sc.parallelize(List(5, 6, 4, 7, 3, 8, 2, 9, 1, 10))
//multiply every element of rdd1 by 2, then sort ascending
val rdd2 = rdd1.map(_ * 2).sortBy(x => x, true)
//keep only the elements >= 5
val rdd3 = rdd2.filter(_ >= 5)
//bring the elements back to the driver as an array
rdd3.collect
--Common operators:
//intersection
val rdd4 = rdd1.intersection(rdd2)
//deduplicate
rdd3.distinct.collect
rdd4.collect
val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2)))
val rdd2 = sc.parallelize(List(("jerry", 2), ("tom", 1), ("shuke", 2)))
//join by key
val rdd3 = rdd1.join(rdd2)
rdd3.collect
//union
val rdd4 = rdd1 union rdd2
rdd4.collect
//group values by key
val rdd5=rdd4.groupByKey
rdd5.collect
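//groupByKey ships every value per key; for per-key sums, reduceByKey combines
//map-side first and is usually cheaper (a sketch over the same rdd4):
val rdd6 = rdd4.reduceByKey(_ + _)
rdd6.collect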
--DataFrame programming:
--Schema level: StructType describes the table structure; StructField describes a single column
--Data level: Row holds one row of data; Column refers to one column
-------------------------------------------
|               StructType                |
-------------------------------------------
| StructField | StructField | StructField |
-------------------------------------------
|     id      |    name     |     age     |
-------------------------------------------
|      1      |    张三     |     18      |
-------------------------------------------
.........
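--The same schema can also be built explicitly with StructType/StructField,
--a sketch of the alternative to the case-class route used below:
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
val schema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType),
  StructField("age", IntegerType)
))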
--1. Read the data:
val lineRDD= sc.textFile("/user/stu01/cx_person.txt").map(_.split(" "))
--2. Define a case class, i.e. the record format
case class Person(id:Int, name:String, age:Int)
--3. Map the RDD onto the case class:
val personRDD = lineRDD.map(x => Person(x(0).toInt, x(1), x(2).toInt))
--4. Convert the RDD to a DataFrame
val personDF = personRDD.toDF
--5. Inspect the DataFrame
personDF.show
personDF.printSchema
---------------------------------
import org.apache.spark.sql.hive.HiveContext
val sqlContext = new HiveContext(sc)
val tableName = "msg_etl_hive"
val createTableQuery = """
CREATE TABLE IF NOT EXISTS msg_etl_hive (
msg_date STRING,
msg_hour STRING,
sender_name STRING,
sender_account STRING,
sender_sex STRING,
sender_ip STRING,
sender_os STRING,
sender_phonetype STRING,
sender_network STRING,
sender_longitude DOUBLE,
sender_latitude DOUBLE,
receiver_name STRING,
receiver_ip STRING,
receiver_account STRING,
receiver_os STRING,
receiver_phonetype STRING,
receiver_network STRING,
receiver_longitude DOUBLE,
receiver_latitude DOUBLE,
receiver_sex STRING,
msg_type STRING,
distance STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
"""
sqlContext.sql(createTableQuery)
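--A quick sanity check that the table is registered in the metastore:
sqlContext.sql("show tables").show()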
--DSL-style queries:
personDF.select(personDF.col("name")).show
personDF.select("name").show
import org.apache.spark.sql.functions.col // needed for the bare col() form
personDF.select(col("id"), col("name"), col("age") + 1).show
--SQL-style analysis: register the DataFrame as a temp table (registerTempTable; createOrReplaceTempView in newer Spark)
personDF.registerTempTable("cx_t_person")
--Then plain SQL can be run against it:
sqlContext.sql("desc cx_t_person").show
sqlContext.sql("select * from cx_t_person where age > 30").show
----------------- Sqoop -----------------------------------
source /opt/client/bigdata_env
sqoop list-databases --connect jdbc:mysql://121.36.103.211/ \
  --username root --password SONGHUAWEI123@shw
sqoop list-databases --connect jdbc:mysql://121.36.103.211:3306/rdsdb \
  --username root --password SONGHUAWEI123@shw
sqoop import --connect jdbc:mysql://116.63.197.140:3306/resdb \
  --username root \
  --password xxx \
  --table cx_socker \
  --hive-import \
  --hive-table cx_table_socker \
  --delete-target-dir --fields-terminated-by "," -m 1 --as-textfile
sqoop import --connect jdbc:mysql://116.63.197.140:3306/resdb \
  --username root --password xxx --table cx_socker \
  --hbase-table cx_hbase_socker --column-family info \
  --hbase-row-key id --hbase-create-table -m 1
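--A quick check from the HBase shell after the import (output depends on the source table):
scan 'cx_hbase_socker', {LIMIT => 1}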