# tpc

**Repository Path**: openkitty/tpc

## Basic Information

- **Project Name**: tpc
- **Description**: No description available
- **Primary Language**: Unknown
- **License**: Apache-2.0
- **Default Branch**: master
- **Homepage**: None
- **GVP Project**: No

## Statistics

- **Stars**: 0
- **Forks**: 0
- **Created**: 2024-01-29
- **Last Updated**: 2024-02-20

## Categories & Tags

**Categories**: Uncategorized

**Tags**: None

## README

# 下载

下载地址:https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp

# TPC-H

## 安装

```shell
WORKSPACE=${HOME}/tpc/tpch
mkdir -p ${WORKSPACE}

# 下载
cd ${WORKSPACE}
rm -f TPC-H_Tools_v3.0.1.zip
wget --no-check-certificate http://jiutian.dev.huawei.com/tools/benchmark/tools/TPC-H_Tools_v3.0.1.zip
rm -rf TPC-H\ V3.0.1
unzip TPC-H_Tools_v3.0.1.zip
rm -rf TPC-H_V3.0.1
mv TPC-H\ V3.0.1 TPC-H_V3.0.1

# 编译
# CFLAGS后增加参数:-DEOL_HANDLING
# 去除行尾的"|"分隔符
cd ${WORKSPACE}/TPC-H_V3.0.1/dbgen
cp -a makefile.suite makefile
sed -i "s@^CC =\(.*\)@CC = gcc@g" makefile
sed -i "s@^DATABASE=\(.*\)@DATABASE= ORACLE@g" makefile
sed -i "s@^MACHINE =\(.*\)@MACHINE = LINUX@g" makefile
sed -i "s@^WORKLOAD =\(.*\)@WORKLOAD = TPCH@g" makefile
sed -i '/-D_FILE_OFFSET_BITS=64/ s/$/ -DEOL_HANDLING/' makefile
grep -E "^CC =|^DATABASE=|^MACHINE =|^WORKLOAD =|-D_FILE_OFFSET_BITS=64" makefile
make -j$(nproc)

# 拷贝二进制
mkdir -p ${WORKSPACE}/data
cd ${WORKSPACE}/data
cp -a ${WORKSPACE}/TPC-H_V3.0.1/dbgen/dbgen ./
cp -a ${WORKSPACE}/TPC-H_V3.0.1/dbgen/dists.dss ./
```

## 生成数据

```shell
WORKSPACE=${HOME}/tpc/tpch

# 生成数据1:
# 总共8张表(.tbl)
# -s: 比例因子(单位GB)
scale=1
cd ${WORKSPACE}/data
./dbgen -s ${scale} -f

# 移动至目录
mkdir -p ${WORKSPACE}/data/SF1/tbl
mv ${WORKSPACE}/data/*.tbl ${WORKSPACE}/data/SF1/tbl/

# 生成数据2:
# 切片(分割成3个文件)
# -C:一共分成几个chunk
# -S:当前命令生成第几个chunk
#scale=1
#chunks=3
#cd ${WORKSPACE}/data
#for ((i=1; i<=chunks; i++)); do
#    ./dbgen -s ${scale} -C ${chunks} -S ${i} -f &
#done
#wait

# 删除每行最后一个字符"|"
#cd ${WORKSPACE}/data
#for file in $(ls *.tbl); do
#    sed -i 's/.$//' ${file}
#done
```

## to-parquet

安装依赖:

```shell
# 依赖组件
pip3 install pandas \
    pyarrow \
    joblib \
    filesplit
```

转换成parquet:

```shell
# 前提条件:拷贝转换脚本
# http://jiutian.dev.huawei.com/tools/benchmark/pytools/tpch/

WORKSPACE=${HOME}/tpc/tpch
input_path="${WORKSPACE}/data/SF1/tbl"
output_path="${WORKSPACE}/data/SF1/parquet"
scale=1
partition_size=$((1024 * 1024 * 128)) # 文件切片大小,默认值128M可以不指定
parallel_num=16 # 利用python多进程进行数据处理的并行度,默认值16可以不指定

# 将原始数据集tbl文件进行切片并转换成parquet格式
python3 generation.py \
    --dataset_save_type=parquet \
    --scale_size=${scale} \
    --tbl_file_path=${input_path} \
    --dataset_save_path=${output_path}
```

# TPC-DS

## 安装

安装编译依赖:

```shell
# 编译依赖
yum -y install gcc \
    make \
    flex \
    byacc \
    bison
```

编译安装:

```shell
WORKSPACE=${HOME}/tpc/tpcds
mkdir -p ${WORKSPACE}

# 下载
cd ${WORKSPACE}
rm -f TPC-DS_Tools_v3.2.0.zip
wget --no-check-certificate http://jiutian.dev.huawei.com/tools/benchmark/tools/TPC-DS_Tools_v3.2.0.zip
rm -rf DSGen-software-code-3.2.0rc1
unzip TPC-DS_Tools_v3.2.0.zip

# 编译(使用gcc-10.x.x版本编译有问题,暂未解决)
# https://stackoverflow.com/questions/69908418/multiple-definition-of-first-defined-here-on-gcc-10-2-1-but-not-gcc-8-3-0
# https://stackoverflow.com/questions/75377874/many-multiple-definition-of-errors-shows-up-when-compiling-tpc-ds-tools
cd ${WORKSPACE}/DSGen-software-code-3.2.0rc1/tools
make -j$(nproc)
```

## 生成数据

```shell
WORKSPACE=${HOME}/tpc/tpcds
scale=1
DATA_PATH=${WORKSPACE}/data/SF${scale}
mkdir -p ${DATA_PATH}
rm -f ${DATA_PATH}/*.dat

# 生成数据1
cd ${WORKSPACE}/DSGen-software-code-3.2.0rc1/tools
./dsdgen -scale ${scale} -dir ${DATA_PATH} -terminate N

# 生成数据2(分割成4个文件)
#cd ${WORKSPACE}/DSGen-software-code-3.2.0rc1/tools
#./dsdgen -scale ${scale} -dir ${DATA_PATH} -parallel 4 -child 1 &
#./dsdgen -scale ${scale} -dir ${DATA_PATH} -parallel 4 -child 2 &
#./dsdgen -scale ${scale} -dir ${DATA_PATH} -parallel 4 -child 3 &
#./dsdgen -scale ${scale} -dir ${DATA_PATH} -parallel 4 -child 4 &
#wait
```

## to-parquet

转换成parquet:

```shell
# 前提条件:拷贝转换脚本
# http://jiutian.dev.huawei.com/tools/benchmark/pytools/tpcds/

WORKSPACE=${HOME}/tpc/tpcds
input_path="${WORKSPACE}/data/SF1"
output_path="${WORKSPACE}/data/SF1/parquet"
scale=1

cd ${HOME}/tpc/pytools/tpcds
python3 generation.py \
    --dataset_save_type=parquet \
    --scale_size=${scale} \
    --tbl_file_path=${input_path} \
    --dataset_save_path=${output_path}
```