大数据应用之 --- apache doris 基于ssb测试
大数据应用之 --- apache doris 基于ssb测试
·
大数据应用之 — apache doris 基于ssb测试
-
下载doris的ssb-tools
https://github.com/apache/doris
将 doris-master\tools\ssb-tools 上传到lsyk01:/softw
-
下载 ssb-gen工具包(因为虚拟机没有联网)
https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz
上传至 lsyk01:/softw/ssb-tools
- 修改脚本 /softw/ssb-tools/build-ssb-dbgen.sh
vi /softw/ssb-tools/build-ssb-dbgen.sh
#修改如下:不去下载了,直接解压下载好的包
# download ssb-dbgen first
if [[ -d $SSB_DBGEN_DIR ]]; then
echo "Dir $CURDIR/ssb-dbgen/ already exists. No need to download."
echo "If you want to download ssb-dbgen again, please delete this dir first."
else
#curl https://palo-cloud-repo-bd.bd.bcebos.com/baidu-doris-release/ssb-dbgen-linux.tar.gz | tar xz -C $CURDIR/
tar -zxvf $CURDIR/ssb-dbgen-linux.tar.gz -C $CURDIR/
fi
- 编译ssb-gen
cd /softw/ssb-tools
sh build-ssb-dbgen.sh
- 生成测试数据
sh gen-ssb-data.sh -s 40
du -sh *
110M customer.tbl
228K date.tbl
2.4G lineorder.tbl.1
2.4G lineorder.tbl.10
2.4G lineorder.tbl.2
2.4G lineorder.tbl.3
2.4G lineorder.tbl.4
2.4G lineorder.tbl.5
2.4G lineorder.tbl.6
2.4G lineorder.tbl.7
2.4G lineorder.tbl.8
2.4G lineorder.tbl.9
99M part.tbl
6.5M supplier.tbl
wc -l *
1200000 customer.tbl
2556 date.tbl
23996604 lineorder.tbl.1
24001837 lineorder.tbl.10
23992403 lineorder.tbl.2
23996070 lineorder.tbl.3
24003563 lineorder.tbl.4
24005968 lineorder.tbl.5
24005179 lineorder.tbl.6
23998304 lineorder.tbl.7
24002460 lineorder.tbl.8
24009902 lineorder.tbl.9
1200000 part.tbl
80000 supplier.tbl
- 配置 doris-cluster.conf
# Any of FE host
export FE_HOST='lsyk01'
# http_port in fe.conf
export FE_HTTP_PORT=8030
# query_port in fe.conf
export FE_QUERY_PORT=9030
# Doris username
export USER='root'
# Doris password
export PASSWORD='fa'
# The database where SSB tables located
export DB='ssb'
- 建表
sh ./create-ssb-tables.sh
sh ./create-ssb-flat-table.sh
- 导入数据
sh ./load-ssb-dimension-data.sh
sh ./load-ssb-fact-data.sh -c 5
很吃内存:
用时 8分钟,大小大概6.8G,原文件是:24G
mysql> select count(1) from ssb.lineorder
由此可见,apache doris 的缓存了得啊。。。
- 导入flat宽表
sh ./load-ssb-flat-data.sh
报错:
查看代码,发现没有指定密码:
增加-p密码
耗时25分钟,还报错了,是内存不足了:
语句拿出来,半年一次,100秒,比官方的脚本快
挂了
INSERT INTO ssb.lineorder_flat
SELECT
LO_ORDERDATE,
LO_ORDERKEY,
LO_LINENUMBER,
LO_CUSTKEY,
LO_PARTKEY,
LO_SUPPKEY,
LO_ORDERPRIORITY,
LO_SHIPPRIORITY,
LO_QUANTITY,
LO_EXTENDEDPRICE,
LO_ORDTOTALPRICE,
LO_DISCOUNT,
LO_REVENUE,
LO_SUPPLYCOST,
LO_TAX,
LO_COMMITDATE,
LO_SHIPMODE,
C_NAME,
C_ADDRESS,
C_CITY,
C_NATION,
C_REGION,
C_PHONE,
C_MKTSEGMENT,
S_NAME,
S_ADDRESS,
S_CITY,
S_NATION,
S_REGION,
S_PHONE,
P_NAME,
P_MFGR,
P_CATEGORY,
P_BRAND,
P_COLOR,
P_TYPE,
P_SIZE,
P_CONTAINER
FROM (
SELECT
lo_orderkey,
lo_linenumber,
lo_custkey,
lo_partkey,
lo_suppkey,
lo_orderdate,
lo_orderpriority,
lo_shippriority,
lo_quantity,
lo_extendedprice,
lo_ordtotalprice,
lo_discount,
lo_revenue,
lo_supplycost,
lo_tax,
lo_commitdate,
lo_shipmode
FROM ssb.lineorder
-- WHERE ${con}
) l
INNER JOIN ssb.customer c
ON (c.c_custkey = l.lo_custkey)
INNER JOIN ssb.supplier s
ON (s.s_suppkey = l.lo_suppkey)
INNER JOIN ssb.part p
ON (p.p_partkey = l.lo_partkey);
select 'part',count(*) from ssb.part union all
select 'customer',count(*) from ssb.customer union all
select 'supplier',count(*) from ssb.supplier union all
select 'date',count(*) from ssb.dates union all
select 'lineorder',count(*) from ssb.lineorder union all
select 'lineorder_flat',count(*) from ssb.lineorder_flat
- 测试结果
set global enable_vectorized_engine=1;
set global parallel_fragment_exec_instance_num=8;
set global exec_mem_limit=48G;
set global batch_size=4096;
set global enable_projection=true;
set global runtime_filter_mode=global;
--Q1.1 0.68
SELECT
SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
ssb.lineorder_flat
WHERE
LO_ORDERDATE >= 19930101
AND LO_ORDERDATE <= 19931231
AND LO_DISCOUNT BETWEEN 1 AND 3
AND LO_QUANTITY < 25;
--Q1.2 0.12
SELECT
SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
ssb.lineorder_flat
WHERE
LO_ORDERDATE >= 19940101
AND LO_ORDERDATE <= 19940131
AND LO_DISCOUNT BETWEEN 4 AND 6
AND LO_QUANTITY BETWEEN 26 AND 35;
--Q1.3 0.78
SELECT
SUM(LO_EXTENDEDPRICE * LO_DISCOUNT) AS revenue
FROM
ssb.lineorder_flat
WHERE
weekofyear(LO_ORDERDATE) = 6
AND LO_ORDERDATE >= 19940101
AND LO_ORDERDATE <= 19941231
AND LO_DISCOUNT BETWEEN 5 AND 7
AND LO_QUANTITY BETWEEN 26 AND 35;
--Q2.1 4.47
SELECT
SUM(LO_REVENUE)
,
(LO_ORDERDATE DIV 10000) AS YEAR
,
P_BRAND
FROM
ssb.lineorder_flat
WHERE
P_CATEGORY = 'MFGR#12'
AND S_REGION = 'AMERICA'
GROUP BY
YEAR,
P_BRAND
ORDER BY
YEAR,
P_BRAND;
--Q2.2 2.69
SELECT
SUM(LO_REVENUE),
(LO_ORDERDATE DIV 10000) AS YEAR,
P_BRAND
FROM
ssb.lineorder_flat
WHERE
P_BRAND >= 'MFGR#2221'
AND P_BRAND <= 'MFGR#2228'
AND S_REGION = 'ASIA'
GROUP BY
YEAR,
P_BRAND
ORDER BY
YEAR,
P_BRAND;
--Q2.3 2.07
SELECT
SUM(LO_REVENUE),
(LO_ORDERDATE DIV 10000) AS YEAR,
P_BRAND
FROM
ssb.lineorder_flat
WHERE
P_BRAND = 'MFGR#2239'
AND S_REGION = 'EUROPE'
GROUP BY
YEAR,
P_BRAND
ORDER BY
YEAR,
P_BRAND;
--Q3.1 4.10
SELECT
C_NATION,
S_NATION,
(LO_ORDERDATE DIV 10000) AS YEAR,
SUM(LO_REVENUE) AS revenue
FROM
ssb.lineorder_flat
WHERE
C_REGION = 'ASIA'
AND S_REGION = 'ASIA'
AND LO_ORDERDATE >= 19920101
AND LO_ORDERDATE <= 19971231
GROUP BY
C_NATION,
S_NATION,
YEAR
ORDER BY
YEAR ASC,
revenue DESC;
--Q3.2 3.99
SELECT
C_CITY,
S_CITY,
(LO_ORDERDATE DIV 10000) AS YEAR,
SUM(LO_REVENUE) AS revenue
FROM
ssb.lineorder_flat
WHERE
C_NATION = 'UNITED STATES'
AND S_NATION = 'UNITED STATES'
AND LO_ORDERDATE >= 19920101
AND LO_ORDERDATE <= 19971231
GROUP BY
C_CITY,
S_CITY,
YEAR
ORDER BY
YEAR ASC,
revenue DESC;
--Q3.3 1.76
SELECT
C_CITY,
S_CITY,
(LO_ORDERDATE DIV 10000) AS YEAR,
SUM(LO_REVENUE) AS revenue
FROM
ssb.lineorder_flat
WHERE
C_CITY IN ('UNITED KI1', 'UNITED KI5')
AND S_CITY IN ('UNITED KI1', 'UNITED KI5')
AND LO_ORDERDATE >= 19920101
AND LO_ORDERDATE <= 19971231
GROUP BY
C_CITY,
S_CITY,
YEAR
ORDER BY
YEAR ASC,
revenue DESC;
--Q3.4 0.1
SELECT
C_CITY,
S_CITY,
(LO_ORDERDATE DIV 10000) AS YEAR,
SUM(LO_REVENUE) AS revenue
FROM
ssb.lineorder_flat
WHERE
C_CITY IN ('UNITED KI1', 'UNITED KI5')
AND S_CITY IN ('UNITED KI1', 'UNITED KI5')
AND LO_ORDERDATE >= 19971201
AND LO_ORDERDATE <= 19971231
GROUP BY
C_CITY,
S_CITY,
YEAR
ORDER BY
YEAR ASC,
revenue DESC;
--Q4.1 5.97
SELECT
(LO_ORDERDATE DIV 10000) AS YEAR,
C_NATION,
SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
ssb.lineorder_flat
WHERE
C_REGION = 'AMERICA'
aND S_REGION = 'AMERICA'
AND P_MFGR IN ('MFGR#1', 'MFGR#2')
GROUP BY
YEAR,
C_NATION
ORDER BY
YEAR ASC,
C_NATION ASC;
--Q4.2 1.48
SELECT
(LO_ORDERDATE DIV 10000) AS YEAR,
S_NATION,
P_CATEGORY,
SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
ssb.lineorder_flat
WHERE
C_REGION = 'AMERICA'
AND S_REGION = 'AMERICA'
AND LO_ORDERDATE >= 19970101
AND LO_ORDERDATE <= 19981231
AND P_MFGR IN ('MFGR#1', 'MFGR#2')
GROUP BY
YEAR,
S_NATION,
P_CATEGORY
ORDER BY
YEAR ASC,
S_NATION ASC,
P_CATEGORY ASC;
--Q4.3 1.13
SELECT
(LO_ORDERDATE DIV 10000) AS YEAR,
S_CITY,
P_BRAND,
SUM(LO_REVENUE - LO_SUPPLYCOST) AS profit
FROM
ssb.lineorder_flat
WHERE
S_NATION = 'UNITED STATES'
AND LO_ORDERDATE >= 19970101
AND LO_ORDERDATE <= 19981231
AND P_CATEGORY = 'MFGR#14'
GROUP BY
YEAR,
S_CITY,
P_BRAND
ORDER BY
YEAR ASC,
S_CITY ASC,
P_BRAND ASC;
--Q5.1 58.79
select
count(1),
sum(cnt)
from
(
select
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND,
count(1) as cnt,
sum(LO_SUPPLYCOST)
from
ssb.lineorder_flat
group by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
order by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
) t ;
--3218808 240012290
--Q5.2 5.43
select
count(1),
sum(cnt)
from
(
select
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND,
count(1) as cnt,
sum(LO_SUPPLYCOST)
from
ssb.lineorder_flat
where
S_NATION = 'UNITED STATES'
AND P_CATEGORY = 'MFGR#14'
group by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
order by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
) t ;
--117571
--Q6.1 58.79
select
count(1),
sum(cnt)
from
(
select
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND,
count(1) as cnt,
sum(LO_SUPPLYCOST) as sm,
count(distinct S_NAME) as dcnt
from
ssb.lineorder_flat
group by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
order by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
) t ;
--报错,内存不足
--Q6.2 10.81
select
count(1),
sum(cnt)
from
(
select
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND,
count(1) as cnt,
sum(LO_SUPPLYCOST) as sm,
count(distinct S_NAME) as dcnt
from
ssb.lineorder_flat
where
S_NATION = 'UNITED STATES'
AND P_CATEGORY = 'MFGR#14'
group by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
order by
LO_ORDERPRIORITY,
LO_SHIPMODE,
P_COLOR,
P_BRAND
) t ;
--117571 386092
更多推荐
所有评论(0)