更新时间:2022-04-29 20:56:12
本文介绍集群运行 Hive 作业时一些常见的参数优化,可根据业务需要选用。
dfs.client.read.shortcircuit=true //直读
dfs.client.read.shortcircuit.streams.cache.size=4096 //直读缓存
dfs.datanode.balance.bandwidthPerSec=30048576 //提高balance带宽(单位:字节/秒,约28.7MB/s),一般扩容后调整
dfs.datanode.max.transfer.threads=16384 //提高线程数
dfs.namenode.checkpoint.period=21600 //延长checkpoint间隔(单位:秒,即6小时)
dfs.namenode.handler.count=100 //并发数,大集群要提高
dfs.namenode.fslock.fair=false //使用非公平锁:提高锁吞吐量和读性能,但写线程可能被饿死,写性能会下降
dfs.namenode.lifeline.handler.count=1 //ha集群优化,大集群使用
hive.metastore.server.max.threads=100000
hive.compactor.worker.threads=5
hive.metastore.client.socket.timeout=1800s
hive.metastore.failure.retries=5
hive.exec.max.dynamic.partitions=5000
hive.exec.max.dynamic.partitions.pernode=2000
set hive.execution.engine=tez;
SET hive.tez.auto.reducer.parallelism=true;
SET hive.tez.max.partition.factor=20;
STORED AS ORC tblproperties ("orc.compress" = "SNAPPY")
hive.exec.orc.default.compress=SNAPPY
hive.exec.parallel=true
SET hive.exec.reducers.bytes.per.reducer=128000000;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
hive.limit.optimize.enable=true
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
查询前先收集常用表以及常用于join的列的统计信息:
analyze table tweets compute statistics;
analyze table tweets compute statistics for columns sender, topic;
set hive.enforce.bucketing = true;
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;