大数据应用之 — apache doris 基于ssb测试

  1. 下载doris的ssb-tools

    https://github.com/apache/doris

    将 doris-master\tools\ssb-tools 上传到lsyk01:/softw

  2. 下载 ssb-gen工具包(因为虚拟机没有联网)

    https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz

​ 上传至 lsyk01:/softw/ssb-tools

  1. 修改脚本 /softw/ssb-tools/build-ssb-dbgen.sh
vi /softw/ssb-tools/build-ssb-dbgen.sh
#修改如下:不去下载了,直接解压下载好的包
# download ssb-dbgen first
if [[ -d $SSB_DBGEN_DIR ]]; then
    echo "Dir $CURDIR/ssb-dbgen/ already exists. No need to download."
    echo "If you want to download ssb-dbgen again, please delete this dir first."
else
    #curl https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz | tar xz -C $CURDIR/
    tar -zxvf $CURDIR/ssb-dbgen-linux.tar.gz -C $CURDIR/
fi
  1. 编译ssb-gen
cd /softw/ssb-tools
sh build-ssb-dbgen.sh
  1. 生成测试数据
sh gen-ssb-data.sh -s 40

du -sh *
110M    customer.tbl
228K    date.tbl
2.4G    lineorder.tbl.1
2.4G    lineorder.tbl.10
2.4G    lineorder.tbl.2
2.4G    lineorder.tbl.3
2.4G    lineorder.tbl.4
2.4G    lineorder.tbl.5
2.4G    lineorder.tbl.6
2.4G    lineorder.tbl.7
2.4G    lineorder.tbl.8
2.4G    lineorder.tbl.9
99M     part.tbl
6.5M    supplier.tbl


wc -l *
    1200000 customer.tbl
       2556 date.tbl
   23996604 lineorder.tbl.1
   24001837 lineorder.tbl.10
   23992403 lineorder.tbl.2
   23996070 lineorder.tbl.3
   24003563 lineorder.tbl.4
   24005968 lineorder.tbl.5
   24005179 lineorder.tbl.6
   23998304 lineorder.tbl.7
   24002460 lineorder.tbl.8
   24009902 lineorder.tbl.9
    1200000 part.tbl
      80000 supplier.tbl
  1. 配置 doris-cluster.conf
# Any of FE host
export FE_HOST='lsyk01'
# http_port in fe.conf
export FE_HTTP_PORT=8030
# query_port in fe.conf
export FE_QUERY_PORT=9030
# Doris username
export USER='root'
# Doris password
export PASSWORD='fa'
# The database where SSB tables located
export DB='ssb'
  1. 建表
sh ./create-ssb-tables.sh
sh ./create-ssb-flat-table.sh
  1. 导入数据
sh ./load-ssb-dimension-data.sh

sh ./load-ssb-fact-data.sh -c 5

​ 很吃内存:

在这里插入图片描述

用时 8分钟,大小大概6.8G,原文件是:24G

mysql> select count(1) from ssb.lineorder

在这里插入图片描述

在这里插入图片描述
由此可见,apache doris 的缓存了得啊。。。

  1. 导入flat宽表
sh ./load-ssb-flat-data.sh

报错:

在这里插入图片描述

查看代码,发现没有指定密码:

在这里插入图片描述

增加-p密码

在这里插入图片描述

耗时25分钟,还报错了,是内存不足了:

在这里插入图片描述

语句拿出来,半年一次,100秒,比官方的脚本快

挂了

在这里插入图片描述

INSERT INTO ssb.lineorder_flat
SELECT
    LO_ORDERDATE,
    LO_ORDERKEY,
    LO_LINENUMBER,
    LO_CUSTKEY,
    LO_PARTKEY,
    LO_SUPPKEY,
    LO_ORDERPRIORITY,
    LO_SHIPPRIORITY,
    LO_QUANTITY,
    LO_EXTENDEDPRICE,
    LO_ORDTOTALPRICE,
    LO_DISCOUNT,
    LO_REVENUE,
    LO_SUPPLYCOST,
    LO_TAX,
    LO_COMMITDATE,
    LO_SHIPMODE,
    C_NAME,
    C_ADDRESS,
    C_CITY,
    C_NATION,
    C_REGION,
    C_PHONE,
    C_MKTSEGMENT,
    S_NAME,
    S_ADDRESS,
    S_CITY,
    S_NATION,
    S_REGION,
    S_PHONE,
    P_NAME,
    P_MFGR,
    P_CATEGORY,
    P_BRAND,
    P_COLOR,
    P_TYPE,
    P_SIZE,
    P_CONTAINER
FROM (
    SELECT
        lo_orderkey,
        lo_linenumber,
        lo_custkey,
        lo_partkey,
        lo_suppkey,
        lo_orderdate,
        lo_orderpriority,
        lo_shippriority,
        lo_quantity,
        lo_extendedprice,
        lo_ordtotalprice,
        lo_discount,
        lo_revenue,
        lo_supplycost,
        lo_tax,
        lo_commitdate,
        lo_shipmode
    FROM ssb.lineorder
    -- WHERE ${con}
) l
INNER JOIN ssb.customer c
ON (c.c_custkey = l.lo_custkey)
INNER JOIN ssb.supplier s
ON (s.s_suppkey = l.lo_suppkey)
INNER JOIN ssb.part p
ON (p.p_partkey = l.lo_partkey);
select 'part',count(*) from ssb.part union all
select 'customer',count(*) from ssb.customer union all
select 'supplier',count(*) from ssb.supplier union all
select 'date',count(*) from ssb.dates union all
select 'lineorder',count(*) from ssb.lineorder union all
select 'lineorder_flat',count(*) from ssb.lineorder_flat

在这里插入图片描述

  1. 测试结果
set global enable_vectorized_engine=1;
set global parallel_fragment_exec_instance_num=8;
set global exec_mem_limit=48G;
set global batch_size=4096;
set global enable_projection=true;
set global runtime_filter_mode=global;

--Q1.1   0.68
SELECT
	SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	LO_ORDERDATE >= 19930101
	AND LO_ORDERDATE <= 19931231
	AND LO_DISCOUNT BETWEEN 1 AND 3
	AND LO_QUANTITY < 25;
	
--Q1.2    0.12
SELECT
	SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	LO_ORDERDATE >= 19940101
	AND LO_ORDERDATE <= 19940131
	AND LO_DISCOUNT BETWEEN 4 AND 6
	AND LO_QUANTITY BETWEEN 26 AND 35;
	
--Q1.3   0.78
SELECT
	SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	weekofyear(LO_ORDERDATE) = 6
	AND LO_ORDERDATE >= 19940101
	AND LO_ORDERDATE <= 19941231
	AND LO_DISCOUNT BETWEEN 5 AND 7
	AND LO_QUANTITY BETWEEN 26 AND 35;
	
--Q2.1   4.47
SELECT
	SUM(LO_REVENUE)
	,
	(LO_ORDERDATE DIV 10000) AS YEAR
	,
	P_BRAND
FROM
	ssb.lineorder_flat
WHERE
	P_CATEGORY = 'MFGR#12'
	AND S_REGION = 'AMERICA'
GROUP BY
	YEAR,
	P_BRAND
ORDER BY
	YEAR,
	P_BRAND;
	
--Q2.2    2.69
SELECT
	SUM(LO_REVENUE),
	(LO_ORDERDATE DIV 10000) AS YEAR,
	P_BRAND
FROM
	ssb.lineorder_flat
WHERE
	P_BRAND >= 'MFGR#2221'
	AND P_BRAND <= 'MFGR#2228'
	AND S_REGION = 'ASIA'
GROUP BY
	YEAR,
	P_BRAND
ORDER BY
	YEAR,
	P_BRAND;
	
--Q2.3   2.07
SELECT
	SUM(LO_REVENUE),
	(LO_ORDERDATE DIV 10000) AS YEAR,
	P_BRAND
FROM
	ssb.lineorder_flat
WHERE
	P_BRAND = 'MFGR#2239'
	AND S_REGION = 'EUROPE'
GROUP BY
	YEAR,
	P_BRAND
ORDER BY
	YEAR,
	P_BRAND;
	
--Q3.1   4.10
SELECT
	C_NATION,
	S_NATION,
	(LO_ORDERDATE DIV 10000) AS YEAR,
	SUM(LO_REVENUE) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	C_REGION = 'ASIA'
	AND S_REGION = 'ASIA'
	AND LO_ORDERDATE >= 19920101
	AND LO_ORDERDATE <= 19971231
GROUP BY
	C_NATION,
	S_NATION,
	YEAR
ORDER BY
	YEAR ASC,
	revenue DESC;
	
--Q3.2   3.99
SELECT
	C_CITY,
	S_CITY,
	(LO_ORDERDATE DIV 10000) AS YEAR,
	SUM(LO_REVENUE) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	C_NATION = 'UNITED STATES'
	AND S_NATION = 'UNITED STATES'
	AND LO_ORDERDATE >= 19920101
	AND LO_ORDERDATE <= 19971231
GROUP BY
	C_CITY,
	S_CITY,
	YEAR
ORDER BY
	YEAR ASC,
	revenue DESC;
	
--Q3.3   1.76
SELECT
	C_CITY,
	S_CITY,
	(LO_ORDERDATE DIV 10000) AS YEAR,
	SUM(LO_REVENUE) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	C_CITY IN ('UNITED KI1', 'UNITED KI5')
	AND S_CITY IN ('UNITED KI1', 'UNITED KI5')
	AND LO_ORDERDATE >= 19920101
	AND LO_ORDERDATE <= 19971231
GROUP BY
	C_CITY,
	S_CITY,
	YEAR
ORDER BY
	YEAR ASC,
	revenue DESC;
	
--Q3.4   0.1
SELECT
	C_CITY,
	S_CITY,
	(LO_ORDERDATE DIV 10000) AS YEAR,
	SUM(LO_REVENUE) AS revenue
FROM
	ssb.lineorder_flat
WHERE
	C_CITY IN ('UNITED KI1', 'UNITED KI5')
	AND S_CITY IN ('UNITED KI1', 'UNITED KI5')
	AND LO_ORDERDATE >= 19971201
	AND LO_ORDERDATE <= 19971231
GROUP BY
	C_CITY,
	S_CITY,
	YEAR
ORDER BY
	YEAR ASC,
	revenue DESC;
	
--Q4.1   5.97
SELECT
	(LO_ORDERDATE DIV 10000) AS YEAR,
	C_NATION,
	SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
	ssb.lineorder_flat
WHERE
	C_REGION = 'AMERICA'
	aND S_REGION = 'AMERICA'
	AND P_MFGR IN ('MFGR#1', 'MFGR#2')
GROUP BY
	YEAR,
	C_NATION
ORDER BY
	YEAR ASC,
	C_NATION ASC;
	
--Q4.2   1.48
SELECT
	(LO_ORDERDATE DIV 10000) AS YEAR,
	S_NATION,
	P_CATEGORY,
	SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
	ssb.lineorder_flat
WHERE
	C_REGION = 'AMERICA'
	AND S_REGION = 'AMERICA'
	AND LO_ORDERDATE >= 19970101
	AND LO_ORDERDATE <= 19981231
	AND P_MFGR IN ('MFGR#1', 'MFGR#2')
GROUP BY
	YEAR,
	S_NATION,
	P_CATEGORY
ORDER BY
	YEAR ASC,
	S_NATION ASC,
	P_CATEGORY ASC;
	
--Q4.3   1.13
SELECT
	(LO_ORDERDATE DIV 10000) AS YEAR,
	S_CITY,
	P_BRAND,
	SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
	ssb.lineorder_flat
WHERE
	S_NATION = 'UNITED STATES'
	AND LO_ORDERDATE >= 19970101
	AND LO_ORDERDATE <= 19981231
	AND P_CATEGORY = 'MFGR#14'
GROUP BY
	YEAR,
	S_CITY,
	P_BRAND
ORDER BY
	YEAR ASC,
	S_CITY ASC,
	P_BRAND ASC;
	
--Q5.1   58.79
select
	count(1),
	sum(cnt)
from
	(
	select
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND,
		count(1) as cnt,
		sum(LO_SUPPLYCOST)
	from
		ssb.lineorder_flat
	group by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
	order by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
) t ;
--3218808  240012290

--Q5.2  5.43 
select
	count(1),
	sum(cnt)
from
	(
	select
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND,
		count(1) as cnt,
		sum(LO_SUPPLYCOST)
	from
		ssb.lineorder_flat
	where
		S_NATION = 'UNITED STATES'
		AND P_CATEGORY = 'MFGR#14'
	group by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
	order by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
) t ;
--117571

--Q6.1   58.79
select
	count(1),
	sum(cnt)
from
	(
	select
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND,
		count(1) as cnt,
		sum(LO_SUPPLYCOST) as sm,
		count(distinct S_NAME) as dcnt
	from
		ssb.lineorder_flat
	group by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
	order by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
) t ;
--报错,内存不足

--Q6.2  10.81 
select
	count(1),
	sum(cnt)
from
	(
	select
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND,
		count(1) as cnt,
		sum(LO_SUPPLYCOST) as sm,
		count(distinct S_NAME) as dcnt
	from
		ssb.lineorder_flat
	where
		S_NATION = 'UNITED STATES'
		AND P_CATEGORY = 'MFGR#14'
	group by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
	order by
		LO_ORDERPRIORITY,
		LO_SHIPMODE,
		P_COLOR,
		P_BRAND
) t ;
--117571 386092

Logo

技术共进,成长同行——讯飞AI开发者社区

更多推荐