Big Data Cluster Environment Setup
Version Information
- Red Hat Enterprise Linux Server release 7.4
- JDK1.8
- Hadoop3.2.0
- Spark3.2.0
- Hive3.2.1
- Python3.8.5
Configure SSH Mutual Trust for the Cluster root User
Configure hostnames
readonly prefix_ip="10.160.9."
for((i=11;i<=20;i++))
do
host="${prefix_ip}${i}"
ssh $host hostnamectl set-hostname "node9${i}"
done
Generate the cluster IP/hostname file cluster_hosts
#!/bin/bash
# gen_hosts.sh
readonly prefix_ip="10.160.9."
# generate the cluster IP addresses 10.160.9.11~10.160.9.20
for((i=11;i<=20;i++))
do
host="${prefix_ip}${i}"
hostname=$(ssh $host hostname)
echo "$host $hostname"
done
# run the script above
bash gen_hosts.sh >> ~/cluster_hosts
# distribute
xsync.sh ~/cluster_hosts
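The later cut commands assume cluster_hosts holds one line per node in the form "IP hostname"; with the node9${i} naming used above it would look roughly like this (illustrative excerpt):
# ~/cluster_hosts
10.160.9.11 node911
10.160.9.12 node912
...
10.160.9.20 node920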
Clear existing SSH configuration (optional)
cluster_all.sh rm -rf ~/.ssh
Generate key pairs
cluster_all.sh ssh-keygen -t rsa
Collect public keys
cluster_all.sh cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
Distribute the key file
xsync.sh ~/.ssh/authorized_keys
At this point, every node in the cluster can log in to the others by IP without a password.
Append host entries to /etc/hosts
# on node 1
cat /root/cluster_hosts >> /etc/hosts
# distribute
xsync.sh /etc/hosts
Configure passwordless login by hostname
# the following can be pasted directly into a shell
hostnames=$(cut -d " " -f 2 /root/cluster_hosts)
for hostname in $hostnames
do
ssh $hostname date
done
# distribute. WARNING: this may overwrite the existing known_hosts files on other nodes -- use with care!!
xsync.sh ~/.ssh/known_hosts
Passwordless login by hostname is now configured.
Adding a New Node to the Cluster
Taking the addition of 10.160.9.22 as an example, run the following steps on 10.160.9.11.
- Regenerate the cluster_hosts file
# for one or two new nodes, edit cluster_hosts by hand and distribute it
# for many nodes, rerun gen_hosts.sh, deduplicate cluster_hosts, then distribute
# deduplicate
sort cluster_hosts | uniq > cluster_hosts.bak
mv cluster_hosts.bak cluster_hosts
# distribute
xsync.sh cluster_hosts
- Rebuild the /etc/hosts file
cat cluster_hosts >> /etc/hosts
# deduplicate /etc/hosts
sort /etc/hosts | uniq > /etc/hosts.bak
mv /etc/hosts.bak /etc/hosts
# distribute
xsync.sh /etc/hosts
- Generate key pairs
# nodes that already have keys do not need new ones -- do not overwrite existing keys!!!
cluster_all.sh ssh-keygen -t rsa -f ~/.ssh/id_rsa
- Collect public keys and deduplicate
cluster_all.sh cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
sort ~/.ssh/authorized_keys | uniq > ~/.ssh/authorized_keys.bak
mv ~/.ssh/authorized_keys.bak ~/.ssh/authorized_keys
# distribute
xsync.sh ~/.ssh/authorized_keys
- Configure passwordless login by hostname
hostnames=$(cut -d " " -f 2 /root/cluster_hosts)
for hostname in $hostnames
do
ssh $hostname date
done
# distribute known_hosts
xsync.sh ~/.ssh/known_hosts
Configure the Cluster bigdata User (Optional)
Avoid operating as root where possible.
Create the bigdata user across the cluster
cluster_all.sh groupadd bigdata
cluster_all.sh useradd -g bigdata bigdata
cluster_all.sh passwd bigdata
# create a bin directory under bigdata's home for the scripts
cluster_all.sh mkdir /home/bigdata/bin
# distribute the scripts in root's bin directory to every node
./xsync.sh /root/bin
# on node 1, copy the scripts to the bigdata user
cp ./xsync.sh ./cluster_all.sh /home/bigdata/bin
# fix ownership of the scripts
chgrp -R bigdata /home/bigdata/bin
chown -R bigdata /home/bigdata/bin
# distribute bigdata's bin directory
./xsync.sh /home/bigdata/bin
At this point the work as root is done; all remaining operations can be performed as the bigdata user.
Configure SSH mutual trust for the bigdata user (same procedure as for root)
Switch to the bigdata user
su - bigdata
cd ~/bin
# press Enter at every prompt
cluster_all.sh ssh-keygen -t rsa
# collect
cluster_all.sh cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
# distribute the combined key file
xsync.sh ~/.ssh/authorized_keys
# configure passwordless login by hostname
hosts=$(cut -d " " -f 2 ~/cluster_hosts)
for host in ${hosts[@]}
do
ssh $host ls / >/dev/null
done
At this point, SSH mutual trust for the bigdata user is configured across the cluster.
Configure the Cluster yum Repository
The yum that ships with RHEL cannot be used here, so install the CentOS yum instead.
Download the required RPM packages into /root/yum; they can be copied from the internal Nexus or downloaded externally (versions may differ).
# /root/yum
python-urlgrabber-3.10-10.el7.noarch.rpm
rpm-4.11.3-45.el7.x86_64.rpm
yum-3.4.3-168.el7.centos.noarch.rpm
yum-metadata-parser-1.1.4-10.el7.x86_64.rpm
yum-plugin-fastestmirror-1.1.31-54.el7_8.noarch.rpm
wget # already installed
# sync the packages
xsync.sh /root/yum
# remove the RHEL-bundled yum
cluster_all.sh 'rpm -qa|grep yum|xargs rpm -e --nodeps'
# install the CentOS yum
cluster_all.sh 'cd /root/yum;rpm -ivh --force python-urlgrabber-3.10-10.el7.noarch.rpm rpm-4.11.3-45.el7.x86_64.rpm yum-metadata-parser-1.1.4-10.el7.x86_64.rpm yum-3.4.3-168.el7.centos.noarch.rpm yum-plugin-fastestmirror-1.1.31-54.el7_8.noarch.rpm'
yum installation complete.
Configure the repo file
On node 1, create /etc/yum.repos.d/centos.repo:
# /etc/yum.repos.d/centos.repo
# RHEL major version is 7
[base]
name=base
baseurl=http://10.160.8.81:8081/repository/centos-aliyun/7/os/$basearch/
enabled=1
gpgcheck=0
[updates]
name=updates
baseurl=http://10.160.8.81:8081/repository/centos-aliyun/7/updates/$basearch/
enabled=1
gpgcheck=0
[extras]
name=extras
baseurl=http://10.160.8.81:8081/repository/centos-aliyun/7/extras/$basearch/
enabled=1
gpgcheck=0
# distribute the repo file
xsync.sh /etc/yum.repos.d/centos.repo
cluster_all.sh yum clean all
cluster_all.sh yum update
cluster_all.sh yum makecache
Cluster yum configuration complete.
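As an optional sanity check (not part of the original steps), confirm that every node now sees the base/updates/extras repos:
# optional: list the enabled repos on every node
cluster_all.sh 'yum repolist enabled'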
Java Installation
Install JDK 1.8 under /soft.
# create the directory
cluster_all.sh mkdir -p /soft
tar -xzvf jdk-8u202-linux-x64.tar.gz -C /soft/
cd /soft
mv jdk1.8.0_202/ jdk1.8
# distribute the files
xsync.sh /soft/jdk1.8
Configure environment variables
vim /etc/profile.d/bigdata.sh
# add
export JAVA_HOME=/soft/jdk1.8
export PATH=$PATH:$JAVA_HOME/bin
# distribute
xsync.sh /etc/profile.d/bigdata.sh
source /etc/profile
Remove the bundled java symlinks
whereis java #java: /usr/bin/java /usr/lib/java /etc/java /usr/share/java /soft/jdk1.8/bin/java /usr/share/man/man1/java.1.gz
ll /usr/bin/java | grep java # /usr/bin/java -> /etc/alternatives/java
cluster_all.sh 'cd /usr/bin;rm -f java javaws'
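An optional cluster-wide check, assuming the environment file has already been distributed as above; note that non-interactive ssh does not source /etc/profile, so source it explicitly:
# optional: verify the JDK on every node (java -version prints to stderr)
cluster_all.sh 'source /etc/profile && java -version'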
Install Python from Source
The following installs Python 3.8.5 on node 1; with the cluster scripts it can be installed cluster-wide.
# install build dependencies
yum install gcc openssl-devel bzip2-devel libffi-devel -y
cd /soft
tar -xzvf Python-3.8.5.tgz
cd Python-3.8.5
./configure --enable-optimizations
make altinstall
# python3.8
Python 3.8.5 (default, Feb 15 2023, 08:52:15)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-44)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>
# pip3.8
[root@node911 Python-3.8.5]# pip3.8
Usage:
pip3.8 <command> [options]
Configure the pip index
mkdir ~/.pip
vim ~/.pip/pip.conf
# add the following
[global]
index-url=http://10.160.8.81:8081/repository/pypi-aliyun/simple/
trusted-host=10.160.8.81
# install virtualenv as a test
pip3.8 install virtualenv
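A minimal smoke test of the new interpreter and virtualenv (paths are illustrative; make altinstall places python3.8 under /usr/local/bin by default):
# create and activate a throwaway virtualenv
virtualenv -p /usr/local/bin/python3.8 /tmp/venv-test
source /tmp/venv-test/bin/activate
python -V    # Python 3.8.5
deactivate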
Hadoop Installation and Configuration
ZooKeeper Installation and Configuration
Install ZooKeeper
cd /soft
tar -xzvf apache-zookeeper-3.5.8.tar.gz
mv apache-zookeeper-3.5.8 zookeeper3.5.8
xsync.sh zookeeper3.5.8
Configure environment variables
vim /etc/profile.d/bigdata.sh
export ZOOKEEPER_HOME=/soft/zookeeper3.5.8
# activate
source /etc/profile
ZooKeeper configuration file
vim $ZOOKEEPER_HOME/conf/zoo.cfg
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
dataDir=/data/zookeeper
# the port at which the clients will connect
clientPort=2182
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1
dataLogDir=/var/zookeeper/log
autopurge.purgeInterval=0
globalOutstandingLimit=200
server.1=node911:2888:3888
server.2=node912:2888:3888
server.3=node913:2888:3888
server.4=node914:2888:3888
server.5=node915:2888:3888
server.6=node916:2888:3888
server.7=node919:2888:3888
server.8=node920:2888:3888
server.9=LENOVO-LA0X1771:2888:3888
Configure myid
#!/bin/bash
hosts=($(cut -d " " -f 1 ~/cluster_hosts))
node_cnt=${#hosts[@]}
for ((i=0;i<${node_cnt};i++))
do
id=$(($i+1))
echo "${hosts[$i]} $id"
ssh ${hosts[$i]} "mkdir -p /data/zookeeper;cd /data/zookeeper;echo ${id} > myid"
done
Start ZooKeeper
myzk.sh start
# check status
myzk.sh status
[root@node911 bin]# ./zkServer.sh status
ZooKeeper JMX enabled by default
Using config: /soft/zookeeper3.5.8/bin/../conf/zoo.cfg
Client port found: 2182. Client address: localhost.
Mode: follower
...
myzk.sh stop
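myzk.sh itself is not reproduced in this document; a minimal sketch, assuming ZooKeeper is installed at /soft/zookeeper3.5.8 on every node listed in ~/cluster_hosts, might look like this:
#!/bin/bash
# myzk.sh -- run zkServer.sh start|stop|status on every node (sketch, not the original script)
if [ $# -lt 1 ]
then
echo "usage: myzk.sh start|stop|status"
exit 1
fi
hosts=$(cut -d " " -f 1 ~/cluster_hosts)
for host in $hosts
do
echo "=========== $host ============"
ssh $host "/soft/zookeeper3.5.8/bin/zkServer.sh $1"
done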
ClickHouse Installation and Configuration
Install the packages
cd /soft/clickhouse
ll
total 1074536
-rw-r--r-- 1 root root 88071 Feb 16 10:07 clickhouse-client-22.9.7.34.x86_64.rpm
-rw-r--r-- 1 root root 248224665 Feb 16 10:07 clickhouse-common-static-22.9.7.34.x86_64.rpm
-rw-r--r-- 1 root root 851891397 Feb 16 10:07 clickhouse-common-static-dbg-22.9.7.34.x86_64.rpm
-rw-r--r-- 1 root root 114011 Feb 16 10:07 clickhouse-server-22.9.7.34.x86_64.rpm
xsync.sh .
cluster_all.sh 'cd /soft/clickhouse;rpm -ivh *.rpm'
Start and log in
/etc/init.d/clickhouse-server {start|stop|status}
# log in
clickhouse-client --password [--host localhost] [--user default] [--port 9000]
Configuration files
ClickHouse's default configuration file is /etc/clickhouse-server/config.xml, which contains fairly detailed comments; user settings live in users.xml. Rather than editing these two files, create new files under config.d and users.d: files in those two directories are loaded automatically and merged over the defaults.
Some settings differ between old and new versions; see the configuration file documentation for details:
- The root element is now clickhouse; older versions used yandex
- Cluster topology is configured under remote_servers
- ZooKeeper connections are configured under zookeeper
Newly created configuration files must have their owner and group changed to clickhouse!!!
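For example (the file names below are illustrative, assuming the fragments created in the following steps are placed in these directories):
# let the clickhouse service read the newly added fragments
chown clickhouse:clickhouse /etc/clickhouse-server/config.d/*.xml
chown clickhouse:clickhouse /etc/clickhouse-server/users.d/*.xml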
Configure users
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- Only one of password / password_sha256_hex may be set; the plaintext form is left commented out -->
<!-- <password>123456</password> -->
<password_sha256_hex>fd20eb05f734d2952b9551f8766664005a33e7b7955eb7b2dff34769b982fbbd</password_sha256_hex>
<networks incl="networks" replace="replace">
<ip>10.160.73.56</ip>
<ip>127.0.0.1</ip>
</networks>
<!-- Settings profile for user. -->
<profile>default</profile>
<!-- Quota for user. -->
<quota>default</quota>
<!-- User can create other users and grant rights to them. -->
<!-- <access_management>1</access_management> -->
</default>
<etl>
<password_sha256_hex>fd20eb05f734d2952b9551f8766664005a33e7b7955eb7b2dff34769b982fbbd</password_sha256_hex>
<networks incl="networks" replace="replace">
<ip>10.160.73.57</ip>
<ip>127.0.0.1</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
<access_management>1</access_management>
</etl>
</users>
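The password_sha256_hex value can be produced from a plaintext password on the shell; for example (replace your_password -- the hash shown above is not derived from this example):
# generate a SHA-256 hex digest for password_sha256_hex
echo -n "your_password" | sha256sum | tr -d ' -'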
Port configuration
<!-- root element: clickhouse -->
<clickhouse>
<!-- override the default 9000 -->
<tcp_port>9002</tcp_port>
<listen_host>::</listen_host>
</clickhouse>
Shard and replica configuration
Three nodes, two shards: shard 1 has two replicas, shard 2 has one replica.
<!-- config file on centos1 -->
<clickhouse>
<remote_servers>
<cluster>
<!-- shard 1 -->
<shard>
<internal_replication>true</internal_replication>
<!-- replica 1 of shard 1: centos1:9002 -->
<replica>
<host>centos1</host>
<port>9002</port>
</replica>
<!-- replica 2 of shard 1: centos2:9002 -->
<replica>
<host>centos2</host>
<port>9002</port>
</replica>
</shard>
<!-- shard 2 -->
<shard>
<internal_replication>true</internal_replication>
<!-- replica 1 of shard 2: centos3:9002 -->
<replica>
<host>centos3</host>
<port>9002</port>
</replica>
</shard>
</cluster>
</remote_servers>
<macros>
<shard>01</shard> <!-- differs per node; 01 = shard 1 -->
<replica>rep_1_1</replica> <!-- differs per node; rep_1_1 = replica 1 of shard 1 -->
</macros>
<zookeeper>
<node>
<host>centos1</host>
<port>2181</port>
</node>
<node>
<host>centos2</host>
<port>2181</port>
</node>
<node>
<host>centos3</host>
<port>2181</port>
</node>
</zookeeper>
</clickhouse>
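Following this naming scheme, the macros on the other two nodes would presumably be as below (illustrative, not taken from the original configuration):
<!-- centos2: replica 2 of shard 1 -->
<macros>
<shard>01</shard>
<replica>rep_1_2</replica>
</macros>
<!-- centos3: replica 1 of shard 2 -->
<macros>
<shard>02</shard>
<replica>rep_2_1</replica>
</macros>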
Distributed tables
After changing the configuration files, restart the cluster.
- Log in on any node
clickhouse-client --port 9002
- Create a database on the cluster
-- the cluster name matches the tag in the XML
centos1 :) create database test_cluster on cluster cluster;
CREATE DATABASE test_cluster ON CLUSTER cluster
Query id: c901e6dc-b797-498c-ba7f-3bf0f20ce77a
┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ centos1 │ 9002 │ 0 │ │ 2 │ 0 │
│ centos3 │ 9002 │ 0 │ │ 1 │ 0 │
│ centos2 │ 9002 │ 0 │ │ 0 │ 0 │
└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
3 rows in set. Elapsed: 0.123 sec.
The database is created on the other two nodes as well.
- Create the replicated local table
centos1 :) create table st_order on cluster cluster (
id UInt32,
sku_id String,
total_amount Decimal(16,2),
create_time Datetime
) engine
=ReplicatedMergeTree('/clickhouse/tables/{shard}/st_order','{replica}')
partition by toYYYYMMDD(create_time)
primary key (id)
order by (id,sku_id);
CREATE TABLE st_order ON CLUSTER cluster
(
`id` UInt32,
`sku_id` String,
`total_amount` Decimal(16, 2),
`create_time` Datetime
)
-- {shard} and {replica} correspond to the values in the macros section
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/st_order', '{replica}')
PARTITION BY toYYYYMMDD(create_time)
PRIMARY KEY id
ORDER BY (id, sku_id)
Query id: d9a64d49-d322-4945-aadd-92561c4bb0bb
┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ centos1 │ 9002 │ 0 │ │ 2 │ 0 │
│ centos3 │ 9002 │ 0 │ │ 1 │ 0 │
│ centos2 │ 9002 │ 0 │ │ 0 │ 0 │
└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
3 rows in set. Elapsed: 0.124 sec.
- Create the distributed table
centos1 :) create table st_order_all on cluster cluster
(
id UInt32,sku_id String,
total_amount Decimal(16,2),
create_time Datetime
)engine = Distributed(cluster, default, st_order, hiveHash(sku_id));
CREATE TABLE st_order_all ON CLUSTER cluster
(
`id` UInt32,
`sku_id` String,
`total_amount` Decimal(16, 2),
`create_time` Datetime
)
ENGINE = Distributed(cluster, default, st_order, hiveHash(sku_id))
Query id: 1f4d3d9f-fd07-464b-8092-7aaef869c5a0
┌─host────┬─port─┬─status─┬─error─┬─num_hosts_remaining─┬─num_hosts_active─┐
│ centos1 │ 9002 │ 0 │ │ 2 │ 0 │
│ centos3 │ 9002 │ 0 │ │ 1 │ 0 │
│ centos2 │ 9002 │ 0 │ │ 0 │ 0 │
└─────────┴──────┴────────┴───────┴─────────────────────┴──────────────────┘
3 rows in set. Elapsed: 0.121 sec.
- Insert test data
centos1 :) insert into st_order_all values
(201,'sku_001',1000.00,'2020-06-01 12:00:00') ,
(202,'sku_002',2000.00,'2020-06-01 12:00:00'),
(203,'sku_004',2500.00,'2020-06-01 12:00:00'),
(204,'sku_002',2000.00,'2020-06-01 12:00:00'),
(205,'sku_003',600.00,'2020-06-02 12:00:00');
INSERT INTO st_order_all FORMAT Values
Query id: 6e3c8134-e00f-4d40-a84d-d014b0d1504a
Ok.
5 rows in set. Elapsed: 0.014 sec.
-- local table
centos1 :) select * from st_order;
SELECT *
FROM st_order
Query id: 5dbf9a23-00fe-444a-9961-96155d4b8168
┌──id─┬─sku_id──┬─total_amount─┬─────────create_time─┐
│ 202 │ sku_002 │ 2000 │ 2020-06-01 12:00:00 │
│ 203 │ sku_004 │ 2500 │ 2020-06-01 12:00:00 │
│ 204 │ sku_002 │ 2000 │ 2020-06-01 12:00:00 │
└─────┴─────────┴──────────────┴─────────────────────┘
3 rows in set. Elapsed: 0.003 sec.
-- distributed table
centos1 :) select * from st_order_all;
SELECT *
FROM st_order_all
Query id: 92bd7c59-780d-491d-bd3d-66aaf8ebccfc
┌──id─┬─sku_id──┬─total_amount─┬─────────create_time─┐
│ 202 │ sku_002 │ 2000 │ 2020-06-01 12:00:00 │
│ 203 │ sku_004 │ 2500 │ 2020-06-01 12:00:00 │
│ 204 │ sku_002 │ 2000 │ 2020-06-01 12:00:00 │
└─────┴─────────┴──────────────┴─────────────────────┘
┌──id─┬─sku_id──┬─total_amount─┬─────────create_time─┐
│ 201 │ sku_001 │ 1000 │ 2020-06-01 12:00:00 │
└─────┴─────────┴──────────────┴─────────────────────┘
┌──id─┬─sku_id──┬─total_amount─┬─────────create_time─┐
│ 205 │ sku_003 │ 600 │ 2020-06-02 12:00:00 │
└─────┴─────────┴──────────────┴─────────────────────┘
5 rows in set. Elapsed: 0.009 sec.
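As an optional check that the remote_servers section was loaded, the cluster topology can be queried from system.clusters (the column set may vary slightly between versions):
# show the shards and replicas ClickHouse knows about for cluster 'cluster'
clickhouse-client --port 9002 --query "SELECT cluster, shard_num, replica_num, host_name, port FROM system.clusters WHERE cluster = 'cluster'"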
Cluster Utility Scripts
For uniform management, the shell scripts are kept in /usr/local/bin.
cluster_all.sh
#!/bin/bash
# cluster_all.sh -- run a command on every node in the cluster
# exit immediately if no argument is given
if [ -z "$1" ]
then
echo "Error Arg!"
exit 1
fi
hosts=$(cut -d " " -f 1 ~/cluster_hosts)
# assemble the command from all arguments
command="$*"
for host in ${hosts[@]}
do
# cluster host password: bigdata
# sshpass -p bigdata ssh -o StrictHostKeyChecking=no $host $command
# run the command on this node
ssh $host $command
# get the IP and hostname of every node
# hostname=$(sshpass -p bigdata ssh $host $command)
# echo "$host $hostname"
done
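Typical usage: quote the remote command so it is passed to every node unchanged, for example:
# run the same command on every node
cluster_all.sh 'df -h /'
cluster_all.sh date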
xsync.sh
#!/bin/bash
if [ $# -lt 1 ]
then
echo "error arg"
exit 1
fi
hosts=($(cut -d " " -f 1 /root/cluster_hosts))
for host in ${hosts[@]}
do
echo "=========== $host ============"
for file in $@
do
if [ ! -e "$file" ]
then
echo "file not exists: $file"
break
fi
if [ -d "$file" ]
then
cur_dir=$(cd -P ${file}; pwd)
pdir=$(cd -P $(dirname ${cur_dir}); pwd)
fname=$(basename ${cur_dir})
else
pdir=$(cd -P $(dirname ${file}); pwd)
fname=$(basename $file)
fi
if [ "$pdir" != "/" ]
then
ssh $host mkdir -p $pdir
fi
#echo "ssh $host mkdir -p $pdir"
#echo "rsync -av $pdir/$fname $host:$pdir"
rsync -av --delete $pdir/$fname $host:$pdir
done
done
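Usage is as seen throughout this document, for example (rsync must be available on every node):
xsync.sh /etc/profile.d/bigdata.sh
xsync.sh /soft/jdk1.8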