From 2877007dcb6cb4def9e46f5485ce21e73ccace0d Mon Sep 17 00:00:00 2001
From: Ashesh Kumar Singh
Date: Fri, 25 Jan 2019 02:54:23 +0530
Subject: [PATCH] Minor Edits

---
 _config.yml                                   |   6 +-
 _includes/head.html                           |   3 +-
 _layouts/default.html                         |  55 ++--
 ...p-multi-node-cluster-installation-guide.md | 203 ++++++++-------
 ...-single-node-cluster-installation-guide.md | 238 ++++++++++--------
 index.html                                    |   3 +-
 6 files changed, 292 insertions(+), 216 deletions(-)

diff --git a/_config.yml b/_config.yml
index ac30942..3352323 100644
--- a/_config.yml
+++ b/_config.yml
@@ -1,6 +1,6 @@
 # Dependencies
 markdown: redcarpet
-highlighter: pygments
+highlighter: rouge

 # Permalinks
 permalink: pretty
@@ -15,7 +15,7 @@ baseurl: /BD_STTP_2016

 author:
   name: 'Ashesh Kumar Singh'
-  url: https://plus.google.com/+AsheshKumar501254
+  url: https://user501254.github.io

 paginate: 5

@@ -25,4 +25,4 @@ version: 2.1.0
 github:
   repo: https://github.com/user501254/BD_STTP_2016

-gems: [jekyll-paginate, jekyll-gist]
+plugins: [jekyll-paginate, jekyll-gist]

diff --git a/_includes/head.html b/_includes/head.html
index e97bc00..49f623b 100644
--- a/_includes/head.html
+++ b/_includes/head.html
@@ -22,8 +22,9 @@
-
+
+

diff --git a/_layouts/default.html b/_layouts/default.html
index 54929e0..18638d9 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -32,24 +32,47 @@
-
-
-
-
+
+
+

diff --git a/hadoop-multi-node-cluster-installation-guide.md b/hadoop-multi-node-cluster-installation-guide.md
index ffbd446..5988aed 100644
--- a/hadoop-multi-node-cluster-installation-guide.md
+++ b/hadoop-multi-node-cluster-installation-guide.md
@@ -3,116 +3,135 @@ layout: page
 title: Hadoop Multi-Node Cluster Installation Guide
 ---
+## Prerequisites

-**Hadoop (2.7.1) Multi-Node cluster configuration**
+Make sure that you have a reliable network without host isolation. Static IP assignment is preferable, or at least use extremely long DHCP leases. Additionally, all nodes (Namenode/master & Datanodes/slaves) should have a common user account with the same password; if they don't, create such an account on all nodes. Having the same username and password on all nodes makes things a bit less complicated.
+First configure all nodes for a single-node cluster. You can use the script that I have posted [here](https://github.com/user501254/BD_STTP_2016/blob/master/InstallHadoop.sh).

-1. Make sure that you have a reliable network without host isolation. Static IP assignment is preferable or at-least have extremely long DHCP lease. Additionally all nodes (Namenode/master & Datanodes/slaves) should have a common user account with same password; in case you don't, make such user account on all nodes. Having same username and password on all nodes makes things a bit less complicated.
-2. *[on all machines]* First configure all nodes for single-node cluster. You can use my script that I have posted over [here](https://github.com/user501254/BD_STTP_2016/blob/master/InstallHadoop.sh).
-3. execute these commands in a new terminal

-       *[on all machines]* ↴
+## STEP 1: Stopping Hadoop Daemons and cleaning up HDFS files

-           stop-dfs.sh;stop-yarn.sh;jps
-           rm -rf /tmp/hadoop-$USER
+1. **On all nodes**, stop any running Hadoop daemons and remove the temporary HDFS files; confirm that the daemons have stopped by running the `jps` command.
+   `stop-dfs.sh; stop-yarn.sh; rm -rf /tmp/hadoop-$USER`

-       *[on Namenode/master only]* ↴
+2. **On Namenode/master only**, remove the datanode directory from HDFS.
+   `rm -rf ~/hadoop_store/hdfs/datanode`

-           rm -rf ~/hadoop_store/hdfs/datanode
+3. **On Datanodes/slaves only**, remove the namenode directory from HDFS.
+   `rm -rf ~/hadoop_store/hdfs/namenode`

-       *[on Datanodes/slaves only]* ↴

-           rm -rf ~/hadoop_store/hdfs/namenode
-4. *[on all machines]* Add IP addresses and corresponding Host names for all nodes in the cluster.

-       sudo nano /etc/hosts
+## STEP 2: Configuring connectivity

-    hosts
+1. **On all nodes**, add IP addresses and corresponding hostnames for all nodes in the cluster.
+   `sudo nano /etc/hosts`
+
+   The `/etc/hosts` file should look somewhat like this after you edit it.

        xxx.xxx.xxx.xxx master
        xxx.xxx.xxx.xxy slave1
        xxx.xxx.xxx.xxz slave2

-        # Additionally you may need to remove lines like "xxx.xxx.xxx.xxx localhost", "xxx.xxx.xxx.xxy localhost", "xxx.xxx.xxx.xxz localhost" etc if they exist.
-        # However it's okay keep lines like "127.0.0.1 localhost" and others.
-
-5. *[on all machines]* Configure iptables
-    Allow default or custom ports that you plan to use for various Hadoop daemons through the firewall
+   Additionally, you may need to remove lines like
+   "xxx.xxx.xxx.xxx localhost" etc. if they exist.
+   However, it's okay to keep lines like "127.0.0.1 localhost" and others.
+
+2. **On all nodes**, configure the firewall.
+
+   Allow default or custom ports that you plan to use for various Hadoop daemons through the firewall.

     OR

-    much easier, disable iptables
-       - on RedHat like distros (Fedora, CentOS)
-
-           sudo systemctl disable firewalld
-           sudo systemctl stop firewalld
-
-       - on Debian like distros (Ubuntu)
-
-           sudo ufw disable
-
-6. *[on Namenode/master only]* Gain ssh access from Namenode (master) to all Datnodes (slaves).
-
-       ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave1
-       ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave2
-
-    confirm things by running `ping slave1`, `ssh slave1`, `ping slave2`, `ssh slave2` etc. You should have a proper response. (Remember to exit each of your ssh sessions by typing `exit` or closing the terminal. To be on the safer side I also made sure that all nodes were able to access each other and not just the Namenode/master.)
-7. *[on all machines]* edit core-site.xml file
-
-       nano /usr/local/hadoop/etc/hadoop/core-site.xml
-
-    core-site.xml
-
-       <configuration>
-           <property>
-               <name>fs.defaultFS</name>
-               <value>hdfs://master:9000</value>
-               <description>NameNode URI</description>
-           </property>
-       </configuration>
-8. *[on all machines]* edit yarn-site.xml file
-
-       nano /usr/local/hadoop/etc/hadoop/yarn-site.xml
-
-    yarn-site.xml
-
-       <configuration>
-           <property>
-               <name>yarn.resourcemanager.hostname</name>
-               <value>master</value>
-               <description>The hostname of the RM.</description>
-           </property>
-           <property>
-               <name>yarn.nodemanager.aux-services</name>
-               <value>mapreduce_shuffle</value>
-           </property>
-           <property>
-               <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
-               <value>org.apache.hadoop.mapred.ShuffleHandler</value>
-           </property>
-       </configuration>
-9. *[on all machines]* modify slaves file, remove the text "localhost" and add slave hostnames
-
-       nano /usr/local/hadoop/etc/hadoop/slaves
-
-    slaves
-
-       slave1
-       slave2
-
-    (I guess having this only on Namenode/master will also work but I did this on all machines anyway. Also note that in this configuration master behaves only as resource manger, this is how I intent it to be.)
-10. *[on all machines]* modify hdfs-site.xml file to change the value for property `dfs.replication` to something > 1 (at-least to the number of slaves in the cluster; here I have two slaves so I would set it to 2)
-11. *[on Namenode/master only]* (re)format the HDFS through namenode
-
-       hdfs namenode -format
-12. *[optional]*
-    - remove `dfs.datanode.data.dir` property from master's hdfs-site.xml file.
-    - remove `dfs.namenode.name.dir` property from all slave's hdfs-site.xml file.
-
-
-**TESTING (execute only on Namenode/master)**
-
-    start-dfs.sh;start-yarn.sh
+   Much easier: disable the firewall entirely (never on a production system).

-    echo "hello world hello Hello" > ~/Downloads/test.txt
+   - on RedHat like distros (Fedora, CentOS)
+     `sudo systemctl stop firewalld; sudo systemctl disable firewalld`

-    hadoop fs -mkdir /input
+   - on Debian like distros (Ubuntu)
+     `sudo ufw disable`

-    hadoop fs -put ~/Downloads/test.txt /input
+3. **On Namenode/master only**, gain `ssh` access from the Namenode (master) to all Datanodes (slaves).
+
+   `ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave1`
+
+   Then confirm connectivity by running `ping slave1`, `ssh slave1`.
+
+   `ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave2`
+
+   Then confirm connectivity by running `ping slave2`, `ssh slave2`.
+
+   Make sure that you get a proper response.
+   Remember to exit each of your ssh sessions by typing `exit` or closing the terminal.
+   To be on the safer side, also make sure that all datanodes can access each other.
+
+
+## STEP 3: Editing Configuration Files
+
+1. **On all nodes**, edit Hadoop's `core-site.xml` file.
+
+   `nano /usr/local/hadoop/etc/hadoop/core-site.xml`
+
+   ```xml
+   <configuration>
+       <property>
+           <name>fs.defaultFS</name>
+           <value>hdfs://master:9000</value>
+           <description>NameNode URI</description>
+       </property>
+   </configuration>
+   ```
-    hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar wordcount /input /output
+2. **On all nodes**, edit Hadoop's `yarn-site.xml` file.

-wait for a few seconds and the mapper and reducer should begin.
+   `nano /usr/local/hadoop/etc/hadoop/yarn-site.xml`
+
+   ```xml
+   <configuration>
+       <property>
+           <name>yarn.resourcemanager.hostname</name>
+           <value>master</value>
+           <description>The hostname of the RM.</description>
+       </property>
+       <property>
+           <name>yarn.nodemanager.aux-services</name>
+           <value>mapreduce_shuffle</value>
+       </property>
+       <property>
+           <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
+           <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+       </property>
+   </configuration>
+   ```
+
+3. **On all nodes**, edit Hadoop's `slaves` file: remove the text "localhost" and add the slave hostnames.
+
+   `nano /usr/local/hadoop/etc/hadoop/slaves`
+
+       slave1's hostname
+       slave2's hostname
+
+   Having this only on the Namenode/master will probably also work, but I did this on all nodes anyway.
+   Also note that in this configuration the master behaves only as a resource manager; this is how I intend it to be.
+
+4. **On all nodes**, modify the `hdfs-site.xml` file to change the value of the property `dfs.replication` to something > 1. It should be at least the number of slaves in the cluster; here I have two slaves, so I would set it to 2 (a quick way to make this edit from the shell is sketched after this list).
+
+5. **On Namenode/master only**, (re)format the HDFS through the namenode.
+
+   `hdfs namenode -format`
+
+6. **Optional**
+   - remove the `dfs.datanode.data.dir` property from the master's `hdfs-site.xml` file.
+   - remove the `dfs.namenode.name.dir` property from all slaves' `hdfs-site.xml` files.
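As mentioned in step 4 above, `dfs.replication` needs to be raised on every node. If you prefer to make that edit from the shell rather than in an editor, a minimal sketch follows; it assumes the `hdfs-site.xml` produced by the single-node guide, where `dfs.replication` is the only property whose value is `1`, so adjust the pattern if your file differs.

```bash
# Bump the replication factor from 1 to 2 in place, keeping a .bak copy.
# Assumes dfs.replication is the only property currently set to 1.
sed -i.bak 's|<value>1</value>|<value>2</value>|' /usr/local/hadoop/etc/hadoop/hdfs-site.xml

# Print the property name and the line after it to confirm the change took effect.
grep -A1 'dfs.replication' /usr/local/hadoop/etc/hadoop/hdfs-site.xml
```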
+
+
+## Testing our setup (execute only on Namenode/master)
+
+    start-dfs.sh; start-yarn.sh
+
+    echo "hello world hello Hello" > ~/Downloads/test.txt
+
+    hadoop fs -mkdir /input
+    hadoop fs -put ~/Downloads/test.txt /input
+    hadoop jar \
+        /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+        wordcount \
+        /input /output

diff --git a/hadoop-single-node-cluster-installation-guide.md b/hadoop-single-node-cluster-installation-guide.md
index 910a6e5..f9a32a4 100644
--- a/hadoop-single-node-cluster-installation-guide.md
+++ b/hadoop-single-node-cluster-installation-guide.md
@@ -3,77 +3,80 @@ layout: page
 title: Hadoop Single-Node Cluster Installation Guide
 ---
-##Prerequisites
+## Prerequisites

-You can install Hadoop in a *Single-Node Cluster* or *Pseudo-Distributed Mode* for testing purpose on any UNIX like system running on either a physical machine or a Virtual Machine.
+You can install Hadoop in a *Single-Node Cluster* or *Pseudo-Distributed Mode* for testing purposes on any UNIX-like system running on either a physical machine or a Virtual Machine. I am assuming that you already have your operating system prepared for this. However, if you don't have a working copy of a compatible operating system already installed, you can see [this link](http://askubuntu.com/questions/6328/how-do-i-install-ubuntu) for help.
+This brief tutorial will focus only on RedHat (like Fedora, CentOS) and Debian (like Ubuntu) distributions, but the steps remain similar on other OSs. If you face any issues, try the steps mentioned in the Troubleshooting section.

-I am assuming that you already have your operating system prepaired for this. However if you don't have a working copy of a compatible operating system already installed, you can see [this link](http://askubuntu.com/questions/6328/how-do-i-install-ubuntu) for help.
-This brief tutorial will focus only on RedHat (like Fedora, CentOS) and Debian (like Ubuntu) distributions but the steps remain similar on other OSs.
-If you face any issues try steps mentioned in the Troubleshoothing section.
-
-If the manual installation process is not working for you, have a look at this [YouTube video](https://youtu.be/gWkbPVNER5k) or simply use the automated [installation script](https://github.com/user501254/BD_STTP_2016/blob/master/InstallHadoop.sh).
-
-##STEP 1: Installing Java, OpenSSH, rsync
+## STEP 1: Installing Java, OpenSSH, rsync

 We need to install certain dependencies before installing Hadoop. This includes Java, OpenSSH and rsync.
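Before installing anything, it can be worth checking whether these tools are already present; a quick, optional sketch (assuming they would be on your `PATH` if installed):

```bash
# Report which of the required tools are already installed and where they live.
for tool in java javac ssh rsync; do
    if command -v "$tool" >/dev/null 2>&1; then
        echo "$tool: $(command -v "$tool")"
    else
        echo "$tool: not found"
    fi
done
```

If anything is reported as "not found", install it with the distribution-specific commands below.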
-On Redhat like systems use:
-
-    sudo yum clean expire-cache && sudo yum install -y java-*-openjdk-devel openssh rsync
-
-On Debian like systems use:
-
-    sudo apt-get update && sudo apt-get install -y default-jdk openssh-server rsync
+On RedHat like systems use:
+`sudo yum clean expire-cache && sudo yum install -y java-*-openjdk-devel openssh rsync`

+On Debian like systems use:
+`sudo apt-get update && sudo apt-get install -y default-jdk openssh-server rsync`

-##STEP 2: Setting up SSH keys
-Genrate passwordless RSA public & private keys, you will be required to answer a prompt by hitting enter to keep the default file location of the keys:
+## STEP 2: Setting up SSH keys

-    ssh-keygen -t rsa -P ''
+Generate a passwordless RSA public & private key pair; when prompted for the file location of the keys, hit Enter to keep the default:
+`ssh-keygen -t rsa -P ''`

-Add the newly created key to the list of authorized keys:
+Add the newly created key to the list of authorized keys:
+`cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys`

-       cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
+## STEP 3: Downloading and Extracting Hadoop archive

-##STEP 3: Downloading and Extracting Hadoop archive
 You can download the latest stable release of Hadoop binary named *`hadoop-x.y.z.tar.gz`* from [`http://www.eu.apache.org/dist/hadoop/common/stable/`](http://www.eu.apache.org/dist/hadoop/common/stable/).

-Download the file to your home folder:
+1. Download the file to your `~/` (home) folder:

+   ```bash
    FILE=$(wget "http://www.eu.apache.org/dist/hadoop/common/stable/" -O - | grep -Po "hadoop-[0-9].[0-9].[0-9].tar.gz" | head -n 1)
    URL=http://www.eu.apache.org/dist/hadoop/common/stable/$FILE
    wget -c "$URL" -O "$FILE"
+   ```

-Extract the downloaded file to `/usr/local/` directory and then rename the just extracted `hadoop-x.y.z` directory to `hadoop` and make yourself the owner:
+2. Extract the downloaded file to the `/usr/local/` directory, then rename the just-extracted `hadoop-x.y.z` directory to `hadoop` and make yourself the owner.

-       sudo tar xfz "$FILE" -C /usr/local
-       sudo mv /usr/local/hadoop-*/ /usr/local/hadoop
-       sudo chown -R $USER:$USER /usr/local/hadoop
+   Extract the downloaded pre-compiled Hadoop binary:
+   `sudo tar xfzv "$FILE" -C /usr/local`
+
+   Move the extracted contents to the `/usr/local/hadoop` directory:
+   `sudo mv /usr/local/hadoop-*/ /usr/local/hadoop`
+
+   Change ownership of the hadoop directory (so that we don't need `sudo` every time):
+   `sudo chown -R $USER:$USER /usr/local/hadoop`
+
+
+## STEP 4: Editing Configuration Files

-##STEP 4: Editing Configuration Files
 Now we need to make changes to a few configuration files.
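Since the steps below overwrite several files under `/usr/local/hadoop/etc/hadoop`, you may want to keep a copy of the stock configuration first, in case you need to compare against it or roll back later; a small, optional sketch that copies it to a dated folder in your home directory (any location works):

```bash
# Keep a pristine copy of the shipped Hadoop configuration before editing it.
cp -a /usr/local/hadoop/etc/hadoop "$HOME/hadoop-etc-backup-$(date +%Y%m%d)"
```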
 1. To append text to your `~/.bashrc` file, execute this block of code in the terminal:

-       cat << 'EOT' >> ~/.bashrc
-       #SET JDK
-       export JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:jre/bin/java::")
-       #HADOOP VARIABLES START
-       export HADOOP_HOME=/usr/local/hadoop
-       export PATH=$PATH:$HADOOP_HOME/bin
-       export PATH=$PATH:$HADOOP_HOME/sbin
-       export HADOOP_MAPRED_HOME=$HADOOP_HOME
-       export HADOOP_COMMON_HOME=$HADOOP_HOME
-       export HADOOP_HDFS_HOME=$HADOOP_HOME
-       export YARN_HOME=$HADOOP_HOME
-       export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
-       export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
-       export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
-       #HADOOP VARIABLES END
-       EOT
+   ```bash
+   cat << 'EOT' >> ~/.bashrc
+   #SET JDK
+   export JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:jre/bin/java::")
+   #HADOOP VARIABLES START
+   export HADOOP_HOME=/usr/local/hadoop
+   export PATH=$PATH:$HADOOP_HOME/bin
+   export PATH=$PATH:$HADOOP_HOME/sbin
+   export HADOOP_MAPRED_HOME=$HADOOP_HOME
+   export HADOOP_COMMON_HOME=$HADOOP_HOME
+   export HADOOP_HDFS_HOME=$HADOOP_HOME
+   export YARN_HOME=$HADOOP_HOME
+   export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
+   export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
+   export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar
+   #HADOOP VARIABLES END
+   EOT
+   ```

 2. To edit `/usr/local/hadoop/etc/hadoop/hadoop-env.sh` file, execute this block of code in the terminal:

@@ -81,77 +84,106 @@ Now we need to make changes to a few configuration files.

 3. To edit `/usr/local/hadoop/etc/hadoop/core-site.xml` file, execute this block of code in the terminal:

-       sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/core-site.xml
-       cat << EOT >> /usr/local/hadoop/etc/hadoop/core-site.xml
-       <configuration>
-           <property>
-               <name>fs.defaultFS</name>
-               <value>hdfs://localhost:9000</value>
-           </property>
-       </configuration>
-       EOT
+   ```bash
+   sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/core-site.xml
+   cat << EOT >> /usr/local/hadoop/etc/hadoop/core-site.xml
+   <configuration>
+       <property>
+           <name>fs.defaultFS</name>
+           <value>hdfs://localhost:9000</value>
+       </property>
+   </configuration>
+   EOT
+   ```

 4. To edit `/usr/local/hadoop/etc/hadoop/yarn-site.xml` file, execute this block of code in the terminal:

-       sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/yarn-site.xml
-       cat << EOT >> /usr/local/hadoop/etc/hadoop/yarn-site.xml
-       <configuration>
-           <property>
-               <name>yarn.nodemanager.aux-services</name>
-               <value>mapreduce_shuffle</value>
-           </property>
-           <property>
-               <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
-               <value>org.apache.hadoop.mapred.ShuffleHandler</value>
-           </property>
-       </configuration>
-       EOT
+   ```bash
+   sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/yarn-site.xml
+   cat << EOT >> /usr/local/hadoop/etc/hadoop/yarn-site.xml
+   <configuration>
+       <property>
+           <name>yarn.nodemanager.aux-services</name>
+           <value>mapreduce_shuffle</value>
+       </property>
+       <property>
+           <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
+           <value>org.apache.hadoop.mapred.ShuffleHandler</value>
+       </property>
+   </configuration>
+   EOT
+   ```

 5. To generate and then edit `/usr/local/hadoop/etc/hadoop/mapred-site.xml` file, execute this block of code in the terminal:

-       cp /usr/local/hadoop/etc/hadoop/mapred-site.xml.template /usr/local/hadoop/etc/hadoop/mapred-site.xml
-       sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/mapred-site.xml
-       cat << EOT >> /usr/local/hadoop/etc/hadoop/mapred-site.xml
-       <configuration>
-           <property>
-               <name>mapreduce.framework.name</name>
-               <value>yarn</value>
-           </property>
-       </configuration>
-       EOT
+   ```bash
+   cp /usr/local/hadoop/etc/hadoop/mapred-site.xml.template /usr/local/hadoop/etc/hadoop/mapred-site.xml
+   sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/mapred-site.xml
+   cat << EOT >> /usr/local/hadoop/etc/hadoop/mapred-site.xml
+   <configuration>
+       <property>
+           <name>mapreduce.framework.name</name>
+           <value>yarn</value>
+       </property>
+   </configuration>
+   EOT
+   ```

 6. To make `~/hadoop_store/hdfs/namenode`, `~/hadoop_store/hdfs/datanode` folders and edit `/usr/local/hadoop/etc/hadoop/hdfs-site.xml` file, execute this block of code in the terminal:

-       mkdir -p ~/hadoop_store/hdfs/namenode
-       mkdir -p ~/hadoop_store/hdfs/datanode
-       sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/hdfs-site.xml
-       cat << EOT >> /usr/local/hadoop/etc/hadoop/hdfs-site.xml
-       <configuration>
-           <property>
-               <name>dfs.replication</name>
-               <value>1</value>
-           </property>
-           <property>
-               <name>dfs.namenode.name.dir</name>
-               <value>file:/home/$USER/hadoop_store/hdfs/namenode</value>
-           </property>
-           <property>
-               <name>dfs.datanode.data.dir</name>
-               <value>file:/home/$USER/hadoop_store/hdfs/datanode</value>
-           </property>
-       </configuration>
-
-##STEP 5: Formatting HDFS
-Before we can start using our hadoop cluster, we need to format the HDFS through the namenode.
+   ```bash
+   mkdir -p ~/hadoop_store/hdfs/namenode
+   mkdir -p ~/hadoop_store/hdfs/datanode
+   sed -n -i.bak '/<configuration>/q;p' /usr/local/hadoop/etc/hadoop/hdfs-site.xml
+   cat << EOT >> /usr/local/hadoop/etc/hadoop/hdfs-site.xml
+   <configuration>
+       <property>
+           <name>dfs.replication</name>
+           <value>1</value>
+       </property>
+       <property>
+           <name>dfs.namenode.name.dir</name>
+           <value>file:/home/$USER/hadoop_store/hdfs/namenode</value>
+       </property>
+       <property>
+           <name>dfs.datanode.data.dir</name>
+           <value>file:/home/$USER/hadoop_store/hdfs/datanode</value>
+       </property>
+   </configuration>
+   EOT
+   ```
+
+
+## STEP 5: Formatting HDFS
+Before we can start using our Hadoop cluster, we need to format the HDFS through the namenode.

 Format the HDFS filesystem; answer password prompts if any:

-       /usr/local/hadoop/bin/hdfs namenode -format
+`/usr/local/hadoop/bin/hdfs namenode -format`
+
+
+## STEP 6: Starting Hadoop daemons
+
+Start the HDFS daemons:
+`/usr/local/hadoop/sbin/start-dfs.sh`
+
+Start the YARN daemons:
+`/usr/local/hadoop/sbin/start-yarn.sh`
+
-##STEP 6: Strating Hadoop daemons
+## Testing our setup

-       /usr/local/hadoop/sbin/start-dfs.sh
-       /usr/local/hadoop/sbin/start-yarn.sh
+    start-dfs.sh; start-yarn.sh

-##Testing our installation
-##Troubleshooting
+    echo "hello world hello Hello" > ~/Downloads/test.txt
+
+    hadoop fs -mkdir /input
+    hadoop fs -put ~/Downloads/test.txt /input
+    hadoop jar \
+        /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
+        wordcount \
+        /input /output
+
+
+## Troubleshooting
+
+If the manual installation process is not working for you, have a look at this [YouTube video](https://youtu.be/gWkbPVNER5k) or simply use the automated [installation script](https://github.com/user501254/BD_STTP_2016/blob/master/InstallHadoop.sh).

diff --git a/index.html b/index.html
index 1780277..c4c4fc1 100644
--- a/index.html
+++ b/index.html
@@ -5,7 +5,8 @@
- {% gist 6567d07166a78db5ef71 BD_STTP_2016.md %} + {% gist 6567d07166a78db5ef71 BD_STTP_2016.md %} +