Related resource files
Link: https://pan.baidu.com/s/1QGQIrVwg56g9eF16ERSLwQ
Extraction code: 7v8n
JSON file read/write example
The example below converts Parquet data to JSON, reads the JSON back, builds a DataFrame from a Dataset<String> of JSON records, and demonstrates read options (primitivesAsString, allowComments), write options (compression, dateFormat), and reading with an explicit schema.
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.*;

public class test8 {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
                .builder()
                .config("spark.driver.host", "localhost")
                .appName("JsonFileTest")
                .master("local")
                .getOrCreate();

        spark.sparkContext().setLogLevel("ERROR");

        // Convert the Parquet file data into JSON file data
        Dataset<Row> sessionDf = spark.read().parquet(Utils.BASE_PATH + "/trackerSession");
        sessionDf.show(false);
//+------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
//|session_id |session_server_time|cookie |cookie_label|ip |landing_url |pageview_count|click_count|domain |domain_label|
//+------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+
//|520815c9-bdd4-40c5-9ffa-df491dcd97e1|2017-09-04 12:00:00|cookie1|固执 |127.0.0.3|https://www.baidu.com |1 |2 |www.baidu.com |level1 |
//|912a4b47-6984-4763-a704-699ee9724585|2017-09-04 12:45:01|cookie1|固执 |127.0.0.3|https://tieba.baidu.com/index.html|1 |2 |tieba.baidu.com|- |
//|79534f7c-b4dc-4bc6-b021-c05d5ceb634c|2017-09-04 12:00:01|cookie2|有偏见 |127.0.0.4|https://www.baidu.com |3 |1 |www.baidu.com |level1 |
// +------------------------------------+-------------------+-------+------------+---------+----------------------------------+--------------+-----------+---------------+------------+

        sessionDf.write().mode(SaveMode.Overwrite).json(Utils.BASE_PATH + "/json");

        // Read the JSON file data back
        Dataset<Row> jsonDF = spark.read().json(Utils.BASE_PATH + "/json");
        jsonDF.show(false);
//+-----------+-------+------------+---------------+------------+---------+----------------------------------+--------------+------------------------------------+-------------------+
//|click_count|cookie |cookie_label|domain |domain_label|ip |landing_url |pageview_count|session_id |session_server_time|
//+-----------+-------+------------+---------------+------------+---------+----------------------------------+--------------+------------------------------------+-------------------+
//|2 |cookie1|固执 |www.baidu.com |level1 |127.0.0.3|https://www.baidu.com |1 |520815c9-bdd4-40c5-9ffa-df491dcd97e1|2017-09-04 12:00:00|
//|2 |cookie1|固执 |tieba.baidu.com|- |127.0.0.3|https://tieba.baidu.com/index.html|1 |912a4b47-6984-4763-a704-699ee9724585|2017-09-04 12:45:01|
//|1 |cookie2|有偏见 |www.baidu.com |level1 |127.0.0.4|https://www.baidu.com |3 |79534f7c-b4dc-4bc6-b021-c05d5ceb634c|2017-09-04 12:00:01|
//+-----------+-------+------------+---------------+------------+---------+----------------------------------+--------------+------------------------------------+-------------------+

        // A DataFrame can be created from a JSON Dataset (of type String)
        List<String> jsonList =
                Arrays.asList("{\"name\":\"Yin\",\"address\":{\"is_old\":true,\"area\":23000.34}}");
        Dataset<String> jsonDataset = spark.createDataset(jsonList, Encoders.STRING());
        jsonDataset.show(false);
// +--------------------------------------------------------+
// |value |
// +--------------------------------------------------------+
// |{"name":"Yin","address":{"is_old":true,"area":23000.34}}|
// +--------------------------------------------------------+

        // Read JSON from a Dataset<String>
        Dataset<Row> jsonDFFromDS = spark.read().json(jsonDataset);
        jsonDFFromDS.show(false);
// +---------------+----+
// |address |name|
// +---------------+----+
// |[23000.34,true]|Yin |
// +---------------+----+

        // Read option settings
        Map<String, String> readOpts = new HashMap<>();
        // Infer all primitive values as string type
        readOpts.put("primitivesAsString", "true");
        // Ignore Java/C++-style comments in JSON records
        readOpts.put("allowComments", "true");
        // Read JSON from the Dataset<String> with the options applied
        jsonDFFromDS = spark.read().options(readOpts).json(jsonDataset);
        jsonDFFromDS.show(false);
// +---------------+----+
// |address |name|
// +---------------+----+
// |[23000.34,true]|Yin |
// +---------------+----+

        // Write option settings
        Map<String, String> writeOpts = new HashMap<>();
        // Compression codec to use when saving to file
        writeOpts.put("compression", "gzip");
        // Format used when writing DateType columns
        writeOpts.put("dateFormat", "yyyy/MM/dd");

        // Build a custom schema: a string "name" and a DateType "date"
        List<StructField> fields = new ArrayList<>();
        StructField name = DataTypes.createStructField("name", DataTypes.StringType, true);
        StructField date = DataTypes.createStructField("date", DataTypes.DateType, true);
        fields.add(name);
        fields.add(date);
        StructType customSchema = DataTypes.createStructType(fields);

        List<String> dateJsonList = Arrays.asList("{'name':'Yin','date':'26/08/2015 18:00'}");
        Dataset<String> dateJsonDataset = spark.createDataset(dateJsonList, Encoders.STRING());
        dateJsonDataset.show(false);
// +----------------------------------------+
// |value |
// +----------------------------------------+
// |{'name':'Yin','date':'26/08/2015 18:00'}|
// +----------------------------------------+

        // Read JSON from the Dataset<String>, this time with an explicit schema
        // and a matching dateFormat read option for parsing the date strings
        Dataset<Row> dateJsonDFFromDS = spark.read()
                .schema(customSchema)
                .option("dateFormat", "dd/MM/yyyy HH:mm")
                .json(dateJsonDataset);
        dateJsonDFFromDS.write().mode(SaveMode.Overwrite).options(writeOpts).json(Utils.BASE_PATH + "/json_date");
        spark.read().json(Utils.BASE_PATH + "/json_date").show(false);
// +----------+----+
// |date |name|
// +----------+----+
// |2015/08/26|Yin |
// +----------+----+

        spark.stop();
    }
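
    // Hypothetical helper (not part of the original example): the nested
    // "address" struct above prints as [23000.34,true]; this is a minimal
    // sketch of how those nested fields could be pulled out into
    // top-level columns instead.
    static void flattenNested(Dataset<Row> df) {
        df.select(
                functions.col("name"),
                functions.col("address.area"),
                functions.col("address.is_old"))
          .show(false);
    }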
}
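
The code refers to a Utils class that ships with the resource files linked above. If you don't download them, a minimal sketch that is enough to compile and run the example; the path value here is only an assumption, the real one comes from the resource bundle (and the trackerSession Parquet input must exist under it):

public class Utils {
    // Root directory for the example's input/output data; assumed value
    public static final String BASE_PATH = "/tmp/spark-json-example";
}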