Database Study Notes

narrow22/7, published 2024/05/22 23:33:25
-------------------------------- Useful Hive functions ----------------------------
-- SUBSTRING_INDEX: splits a string on a delimiter:

    SUBSTRING_INDEX(msg_time, ' ', 1) AS msg_date,
    SUBSTRING_INDEX(SUBSTRING_INDEX(msg_time, ' ', -1), ':', 1) AS msg_hour,
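-- For instance, assuming msg_time looks like '2021/11/01 15:04:32' (a made-up value),
-- a positive count keeps everything before the Nth delimiter, a negative count everything after it:
SELECT SUBSTRING_INDEX('2021/11/01 15:04:32', ' ', 1);   -- '2021/11/01'
SELECT SUBSTRING_INDEX('2021/11/01 15:04:32', ' ', -1);  -- '15:04:32'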

-- CAST: converts a value to another type:
    CAST(SPLIT(receiver_gps, ' ')[0] AS DOUBLE) AS receiver_longitude,
    CAST(SPLIT(receiver_gps, ' ')[1] AS DOUBLE) AS receiver_latitude,

-- SPLIT: splits a string on the given delimiter into an array:

    CAST(SPLIT(receiver_gps, ' ')[0] AS DOUBLE) AS receiver_longitude,
    CAST(SPLIT(receiver_gps, ' ')[1] AS DOUBLE) AS receiver_latitude,
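-- A quick check with a literal (coordinates made up): SPLIT returns an array indexed
-- from 0, and the elements stay strings until CAST is applied:
SELECT SPLIT('116.40 39.90', ' ')[0];  -- '116.40'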

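-- Sample rows for the heros table below (fields: index, name, id, then skin:price pairs joined by '-'):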
2,鲁班七号,54,木偶奇遇记:288-福禄兄弟:288-黑桃队长:60-电玩小子:2288-星空梦想:0
3,后裔,53,精灵王:288-阿尔法小队:588-辉光之辰:888-黄金射手座:1688-如梦令:1314
4,铠,52,龙域领主:288-曙光守护者:1776
5,韩信,52,飞衡:1788-逐梦之影:888-白龙吟:1188-教廷特使:0-街头霸王:888


-- Creating a table that mixes column types
create table heros(
    `index` int,  -- backticked: index is treated as a keyword in some Hive versions
    name string,
    id int,
    price map<string,int>  -- map (key/value) type
)
-- Row format: comma as the field delimiter
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','  -- separates the top-level fields
COLLECTION ITEMS TERMINATED BY '-'  -- separates elements within a collection
MAP KEYS TERMINATED BY ':'  -- separates key from value in each map entry
STORED AS TEXTFILE;  -- stored as a plain text file
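-- Once data is loaded, map columns support bracket lookup plus helpers such as
-- map_keys() and size(); a minimal sketch (the skin name comes from the sample rows above):
SELECT name, price['白龙吟'] AS skin_price, size(price) AS skin_count
FROM heros;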


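-- DISTRIBUTE BY hashes rows to reducers by sender_sex; SORT BY then orders rows
-- within each reducer only (unlike ORDER BY, which is a single global sort):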
SELECT *
FROM msg_etl
DISTRIBUTE BY sender_sex
SORT BY msg_hour ASC;


-- Hive subqueries:
-- FROM subquery:
SELECT changed_etl.sender_account, changed_etl.receiver_account FROM (
    SELECT sender_account, receiver_account, msg_hour FROM msg_etl
    DISTRIBUTE BY sender_account
    SORT BY msg_hour
) changed_etl;

-- WHERE subqueries: correlated and uncorrelated
-- Uncorrelated: find rows whose receiver is the first sender account
SELECT *
FROM msg_etl
WHERE receiver_account IN (
    SELECT sender_account
    FROM msg_etl
    LIMIT 1
);

-- Correlated: the inner query references a column of the outer query
SELECT DISTINCT a.sender_account
FROM msg_etl a
WHERE EXISTS (
    SELECT 1
    FROM msg_etl b
    WHERE b.receiver_account = a.sender_account
);
---------------------------------------------------
-- Counting rows with a CTE:
WITH cte AS (
    SELECT msg_hour, sender_network
    FROM msg_etl
    WHERE msg_hour > 11 AND sender_network = '4G'
)
SELECT COUNT(*) AS row_count FROM cte;
---------------------------------------------------
-- Hive JOIN operations:
-- Inner Join Example
SELECT *
FROM table1
INNER JOIN table2
ON table1.id = table2.id;

-- Left Join Example
SELECT *
FROM table1
LEFT JOIN table2
ON table1.id = table2.id;

SHOW FUNCTIONS;  -- list every available function
DESCRIBE FUNCTION function_name;  -- show the help text for one function

--------------------------------- Hive built-in functions
-- IF: two-way conditional
SELECT IF(sender_network = '5G', 1, 0) AS network_type
FROM msg_etl;
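-- IF combines naturally with aggregation, e.g. counting 5G senders in one pass
-- (a sketch against the same table):
SELECT SUM(IF(sender_network = '5G', 1, 0)) AS cnt_5g
FROM msg_etl;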

-- NVL: substitutes a default when the value is NULL
select nvl(NULL, "NULL_warning");

-- COALESCE: returns the first non-NULL argument (a multi-argument NVL)
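-- A minimal sketch:
SELECT COALESCE(NULL, NULL, 'fallback');  -- 'fallback'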

-- CASE WHEN: multi-way conditional
SELECT
    CASE WHEN sender_sex = 'male' THEN 0
         WHEN sender_sex = 'female' THEN 1
         ELSE NULL
    END AS gender,
    COUNT(*) AS count
FROM msg_etl
GROUP BY sender_sex;

-- CAST: convert an expression to another type
-- syntax: CAST(expr AS type), for example:
SELECT CAST('3.14' AS DOUBLE);

-------------------------------------
-- HBase shell:
-- Getting help:
help
help "COMMAND"

-- Command groups: ddl (table definitions), namespace (namespaces), dml (data operations)
-- Listing and creating namespaces:
list_namespace
create_namespace 'ns1', {'PROPERTY_NAME' => 'PROPERTY_VALUE'}
create_namespace 'bigdata'

-- DDL: listing, creating, and altering tables
list
create 'ns1:t1', {NAME => 'info', VERSIONS => 5}  -- 'info' is a column family; the 'ns1:' prefix may be omitted (default namespace)
-- VERSIONS: how many cell versions to keep, default 1

-- Simple form: table name plus column families
create 'student', 'info', 'info1', 'msg'
-- Several column families with options:
create 'bigdata1:person', {NAME => 'name', VERSIONS => 5}, {NAME => 'card', VERSIONS => 5}
describe 'student'

-- Altering a table
help 'alter'
-- Change the number of versions:
describe 'student'
alter 'student', NAME => 'info', VERSIONS => 5
-- Deleting
help 'delete'
alter 'student', 'delete' => 'info1'  -- drops the info1 column family
-- Note the difference between delete and deleteall when multiple versions are kept
delete 'ns1:t1', 'r1', 'c1', ts1  -- ts1 is a timestamp; it need not be exact
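-- A sketch of the difference (exact version semantics vary by HBase release):
delete 'ns1:t1', 'r1', 'c1'    -- removes the newest version of a single cell
deleteall 'ns1:t1', 'r1'       -- removes the entire row: every column, every version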

--删除表格
disable 'drop'
drop 'student'

-- Writing data in HBase: put creates the cell if absent, otherwise updates it
create 'bigdata:student', 'info', 'msg'
help 'put'
-- Argument order:
put 'namespace:table', 'row_key', 'column_family:column', 'value'
------------------------------------------
|             |   column family: info    |
|             |--------------------------|
|             | info:name  |  info:age   |
|-------------|--------------------------|
| rowkey:1004 |    lisi    |     10      |
------------------------------------------
| ...                                    |
put 'bigdata:student', '1001','info:name','zhangsan'
put 'bigdata:student', '1004','info:age','10'
put 'bigdata:student', '1004','info:name','lisi'
-- Reading data
help 'get'
get 'bigdata:student','1004'
get 'bigdata:student','1004',{COLUMN => 'info:name'}
-- scan: read a range of rows
scan 't1', {STARTROW => '1001'}  -- row ranges are half-open: start inclusive, stop exclusive
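-- Adding a stop row makes the half-open range visible (row keys from the puts above):
scan 'bigdata:student', {STARTROW => '1001', STOPROW => '1004'}  -- includes 1001, excludes 1004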

-----------------------------------------------------
-- spark-sql
-- Reading data from a local path:
val lines = sc.textFile("file:///root/log1.txt")
lines.count()
lines.collect()
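// count() returns the number of lines; collect() pulls the whole RDD back to the
// driver, so it is only safe on small datasets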

-- Reading data from HDFS:
val lines1 = sc.textFile("hdfs://hacluster/user/stu01/input/cx_input_data1.txt")
val lines2 = sc.textFile("/user/stu01/input/cx_input_data1.txt")

-- Sorting an RDD with map and sortBy:
val rdd1 = sc.parallelize(List(5, 6, 4, 7, 3, 8, 2, 9, 1, 10))
// double every element of rdd1, then sort ascending
val rdd2 = rdd1.map(_ * 2).sortBy(x => x, true)
// keep only the elements >= 5
val rdd3 = rdd2.filter(_ >= 5)
// bring the result back to the driver as an array
rdd3.collect
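// expected: Array(6, 8, 10, 12, 14, 16, 18, 20)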

-- Common operators:
// intersection of the two RDDs
val rdd4 = rdd1.intersection(rdd2)
// deduplicate
rdd3.distinct.collect
rdd4.collect


val rdd1 = sc.parallelize(List(("tom", 1), ("jerry", 3), ("kitty", 2)))
val rdd2 = sc.parallelize(List(("jerry", 2), ("tom", 1), ("shuke", 2)))
// join the two pair RDDs by key
val rdd3 = rdd1.join(rdd2)
rdd3.collect
// union (keeps duplicates)
val rdd4 = rdd1 union rdd2
rdd4.collect
// group values by key
val rdd5=rdd4.groupByKey
rdd5.collect
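// For per-key aggregation, reduceByKey is the usual alternative to groupByKey:
// it combines values per key without materializing the groups (a sketch on the same rdd4):
val rdd6 = rdd4.reduceByKey(_ + _)
rdd6.collect  // e.g. ("tom", 2), ("jerry", 5), ("kitty", 2), ("shuke", 2)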

-- DataFrame programming:
-- Schema level: StructType describes the table structure; StructField describes one column
-- Data level: Row holds one row of data; Column holds one column
-------------------------------------------
|               StructType                |
-------------------------------------------
| StructField | StructField | StructField |
-------------------------------------------
|     id      |    name     |     age     |
-------------------------------------------
|     1       |    张三     |     18      |
.........

-- 1. Read the data:
val lineRDD= sc.textFile("/user/stu01/cx_person.txt").map(_.split(" "))

-- 2. Define a case class, i.e. the record layout:
case class Person(id:Int, name:String, age:Int)

-- 3. Map the RDD onto the case class:
val personRDD = lineRDD.map(x => Person(x(0).toInt, x(1), x(2).toInt))

-- 4. Convert the RDD to a DataFrame:
val personDF = personRDD.toDF
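// note: toDF relies on Spark's implicit conversions; spark-shell imports them
// automatically, a standalone app needs e.g. import sqlContext.implicits._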

-- 5. Inspect the DataFrame:
personDF.show
personDF.printSchema

---------------------------------
import org.apache.spark.sql.hive.HiveContext
val sqlContext = new HiveContext(sc)

val tableName = "msg_etl_hive"
val createTableQuery = """
    CREATE TABLE IF NOT EXISTS msg_etl_hive (
        msg_date STRING,
        msg_hour STRING,
        sender_name STRING,
        sender_account STRING,
        sender_sex STRING,
        sender_ip STRING,
        sender_os STRING,
        sender_phonetype STRING,
        sender_network STRING,
        sender_longitude DOUBLE,
        sender_latitude DOUBLE,
        receiver_name STRING,
        receiver_ip STRING,
        receiver_account STRING,
        receiver_os STRING,
        receiver_phonetype STRING,
        receiver_network STRING,
        receiver_longitude DOUBLE,
        receiver_latitude DOUBLE,
        receiver_sex STRING,
        msg_type STRING,
        distance STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
"""
sqlContext.sql(createTableQuery)
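// With the table created, data can be loaded and queried through the same context;
// a sketch (this HDFS path is made up for illustration):
sqlContext.sql("LOAD DATA INPATH '/user/stu01/msg_etl.txt' INTO TABLE msg_etl_hive")
sqlContext.sql("SELECT msg_hour, COUNT(*) FROM msg_etl_hive GROUP BY msg_hour").show()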

-- DSL-style queries:
personDF.select(personDF.col("name")).show
personDF.select("name").show
import org.apache.spark.sql.functions.col  // needed for the bare col() below
personDF.select(col("id"), col("name"), col("age") + 1).show

-- SQL-style analysis: register the DataFrame as a temp table with registerTempTable (createOrReplaceTempView in Spark 2.x+)
personDF.registerTempTable("cx_t_person")

-- then ordinary SQL works against the registered table
spark.sql("desc cx_t_person").show
spark.sql("select * from cx_t_person where age > 30").show

----------------- Sqoop -----------------------------------
source /opt/client/bigdata_env

sqoop list-databases --connect jdbc:mysql://121.36.103.211/ \
--username root --password SONGHUAWEI123@shw

sqoop list-databases --connect jdbc:mysql://121.36.103.211:3306/rdsdb \
--username root --password SONGHUAWEI123@shw

sqoop import --connect jdbc:mysql://116.63.197.140:3306/resdb \
--username root \
--password xxx \
--table cx_socker \
--hive-import \
--hive-table cx_table_socker \
--delete-target-dir --fields-terminated-by "," -m 1 --as-textfile

sqoop import --connect jdbc:mysql://116.63.197.140:3306/resdb \
--username root --password xxx --table cx_socker \
--hbase-table cx_hbase_socker --column-family info \
--hbase-row-key id --hbase-create-table -m 1