63. Spark: Reading Data and Writing to a Database

By 托马斯-酷涛, posted 2022/05/26
[Abstract] Supported data source: JDBC. Requirement: use Spark to write data into MySQL, then read the records back from the database and print them.

Supported Data Source: JDBC

Requirement: use Spark to write data into MySQL, then read the records back from the database and print them.

Table of Contents

Supported Data Source: JDBC

Project Architecture

pom.xml Dependencies

Creating the Database

Business Logic

Complete Code

Running the Program

Project Summary


Project Architecture

(Architecture diagram in the original post.)

pom.xml Dependencies


   
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>cn.itcast</groupId>
    <artifactId>SparkDemo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
        <repository>
            <id>apache</id>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>
    <properties>
        <encoding>UTF-8</encoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.12.11</scala.version>
        <spark.version>3.0.1</spark.version>
        <hadoop.version>2.7.5</hadoop.version>
    </properties>
    <dependencies>
        <!-- Scala language -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Spark Core -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark Streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark Streaming + Kafka -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark SQL -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark SQL + Hive -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive-thriftserver_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Structured Streaming + Kafka -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark MLlib machine-learning module (includes the ALS recommendation algorithm) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.12</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>portable-1.7.7</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.23</version>
        </dependency>
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <plugins>
            <!-- plugin that compiles the Java sources -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <!-- plugin that compiles the Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- fill in your main class here -->
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

        Note: the pom dependencies are a critical part of the implementation. The pom acts as the project's configuration file: the jar packages you may need, the Scala language version, and so on are all declared here. Since the shade plugin is bound to the package phase, mvn clean package will also produce a runnable fat jar.

Creating the Database


  
CREATE TABLE `data` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `name` varchar(255) DEFAULT NULL,
  `age` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

Business Logic

1. Create the local environment and set the log level


   
val conf: SparkConf = new SparkConf().setAppName("spark").setMaster("local[*]")
val sc: SparkContext = new SparkContext(conf)
sc.setLogLevel("WARN")

2. Load the data and create an RDD

val dataRDD: RDD[(String, Int)] = sc.makeRDD(List(("tuomasi", 21), ("孙悟空", 19), ("猪八戒", 20)))

  

3. Iterate per partition (a JDBC Connection is not serializable, so it is opened inside foreachPartition, once per partition rather than per record; the code from steps 4 through 7 all runs inside this closure)


   
dataRDD.foreachPartition(iter => {
  // the code from steps 4-7 below goes here
})

4. Load the driver and open a connection

val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")

  

5. Build the SQL statement (passing NULL for the auto-increment `id` lets MySQL assign it)


   
val sql: String = "INSERT INTO `data` (`id`, `name`, `age`) VALUES (NULL, ?, ?);"
val ps: PreparedStatement = conn.prepareStatement(sql)

6. Process the data (bind each record's fields, add it to the batch, and execute the batch once per partition)


   
iter.foreach(t => { // t is one (name, age) record
  val name: String = t._1
  val age: Int = t._2
  ps.setString(1, name)
  ps.setInt(2, age)
  ps.addBatch()
})
ps.executeBatch()

7. Close the statement and the connection (the statement first, then the connection)


   
if (ps != null) ps.close()
if (conn != null) conn.close()
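If an insert throws, the plain close calls above are skipped and the connection leaks. A more defensive variant of the partition body, shown here only as a minimal sketch with the same names, URL, and credentials as above (imports as in the complete code below), moves the close calls into a finally block:

val conn: Connection = DriverManager.getConnection(
  "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")
val ps: PreparedStatement =
  conn.prepareStatement("INSERT INTO `data` (`id`, `name`, `age`) VALUES (NULL, ?, ?);")
try {
  iter.foreach { case (name, age) =>
    ps.setString(1, name)
    ps.setInt(2, age)
    ps.addBatch()
  }
  ps.executeBatch()
} finally {
  // runs even if a write fails: statement first, then connection
  if (ps != null) ps.close()
  if (conn != null) conn.close()
}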

8. Read from the database (the connection is wrapped in a function because JdbcRDD opens connections on the executors)

val getConnection = () => DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")

  

9. Set the query bounds and the number of partitions (JdbcRDD binds the lower and upper bounds, 1 and 20, into the two ? placeholders of the read query; the sql and mapRow referenced here are defined in step 10 and in the complete code below)


   
val studentTupleRDD: JdbcRDD[(Int, String, Int)] = new JdbcRDD[(Int, String, Int)](
  sc,
  getConnection,
  sql,
  1,  // lower bound: extract records with id from 1
  20, // upper bound: up to id 20
  1,  // number of partitions
  mapRow
)

10. Result-set mapping function


   
val mapRow: ResultSet => (Int, String, Int) = (r: ResultSet) => {
  val id: Int = r.getInt("id")
  val name: String = r.getString("name")
  val age: Int = r.getInt("age")
  (id, name, age)
}

11. Traverse and print the data (with local[*] the println output appears on the driver console; on a cluster it would land in the executor logs)

studentTupleRDD.foreach(println)
  

Complete Code


   
package org.example.spark

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}

object RDD_DataSource {
  def main(args: Array[String]): Unit = {
    // TODO 0. env: create the environment
    val conf: SparkConf = new SparkConf().setAppName("spark").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")

    // TODO 1. source: load the data / create the RDD
    // RDD[(name, age)]
    val dataRDD: RDD[(String, Int)] = sc.makeRDD(List(("tuomasi", 21), ("孙悟空", 19), ("猪八戒", 20)))

    // TODO 2. transformation
    // TODO 3. sink: output
    // Requirement: write the data into MySQL, then read it back out
    dataRDD.foreachPartition(iter => {
      // open one connection per partition
      val conn: Connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")
      val sql: String = "INSERT INTO `data` (`id`, `name`, `age`) VALUES (NULL, ?, ?);"
      val ps: PreparedStatement = conn.prepareStatement(sql)
      iter.foreach(t => { // t is one (name, age) record
        val name: String = t._1
        val age: Int = t._2
        ps.setString(1, name)
        ps.setInt(2, age)
        ps.addBatch()
        //ps.executeUpdate()
      })
      ps.executeBatch()
      // close the statement first, then the connection
      if (ps != null) ps.close()
      if (conn != null) conn.close()
    })

    // read back from MySQL
    val getConnection = () => DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")
    val sql: String = "select id,name,age from data where id >= ? and id <= ?"
    val mapRow: ResultSet => (Int, String, Int) = (r: ResultSet) => {
      val id: Int = r.getInt("id")
      val name: String = r.getString("name")
      val age: Int = r.getInt("age")
      (id, name, age)
    }
    val studentTupleRDD: JdbcRDD[(Int, String, Int)] = new JdbcRDD[(Int, String, Int)](
      sc,
      getConnection,
      sql,
      1,  // lower bound for id
      20, // upper bound for id
      1,  // number of partitions
      mapRow
    )
    studentTupleRDD.foreach(println)
  }
}
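The JdbcRDD above is the classic RDD-level API. Spark SQL also ships a built-in JDBC data source; purely for comparison, here is a minimal sketch of the same round trip with the DataFrame API (the same bigdata database, data table, and credentials are assumed; this class is not part of the original project):

import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}

object DataFrameJdbcSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("spark").master("local[*]").getOrCreate()
    import spark.implicits._

    val url = "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8"
    val props = new Properties()
    props.setProperty("user", "root")
    props.setProperty("password", "123456")

    // write: Spark generates the INSERT statements and the batching itself
    val df = Seq(("tuomasi", 21), ("孙悟空", 19), ("猪八戒", 20)).toDF("name", "age")
    df.write.mode(SaveMode.Append).jdbc(url, "data", props)

    // read: the table comes back as a DataFrame
    spark.read.jdbc(url, "data", props).show()

    spark.stop()
  }
}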

Running the Program

Console output (screenshot in the original post):
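The screenshot is not reproduced here; on a fresh table the printed tuples would look roughly like the following (the ids depend on the table's AUTO_INCREMENT state, and the print order may vary with partitioning):

(1,tuomasi,21)
(2,孙悟空,19)
(3,猪八戒,20)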

Database view (screenshot in the original post): the `data` table now contains the three inserted rows.

        Note: this is a teaching example. In real scenarios the data usually runs to tens of thousands of records or far more; good code proves itself precisely at large data volumes, and tuning is nothing short of a must-have skill on the way to a promotion and a raise. One concrete tuning step, sketched below, is flushing the JDBC batch at a fixed size instead of accumulating an entire partition's inserts in a single batch.
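A minimal sketch of that idea inside the same foreachPartition writer (the batch size of 500 is an assumed value, not from the original; tune it against your data and database):

dataRDD.foreachPartition(iter => {
  val batchSize = 500 // assumed value: flush whenever this many rows are pending
  val conn = DriverManager.getConnection(
    "jdbc:mysql://localhost:3306/bigdata?characterEncoding=UTF-8", "root", "123456")
  val ps = conn.prepareStatement("INSERT INTO `data` (`id`, `name`, `age`) VALUES (NULL, ?, ?);")
  try {
    var pending = 0
    iter.foreach { case (name, age) =>
      ps.setString(1, name)
      ps.setInt(2, age)
      ps.addBatch()
      pending += 1
      if (pending >= batchSize) { // flush a full batch instead of buffering the whole partition
        ps.executeBatch()
        pending = 0
      }
    }
    if (pending > 0) ps.executeBatch() // flush the remainder
  } finally {
    if (ps != null) ps.close()
    if (conn != null) conn.close()
  }
})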

Project Summary

        Summary: gaps in knowledge are inevitable while writing code. When you hit a problem, make a habit of reading the source code; it pays off many times over in later development. Logging and the debugger are likewise not to be underestimated.

Source: tuomasi.blog.csdn.net, author 托马斯-酷涛. Copyright belongs to the original author; please contact the author for reprint permission.

Original link: tuomasi.blog.csdn.net/article/details/122579557
