Minor Edits
askmrsinh committed Jan 24, 2019
1 parent 6ec3735 commit 2877007
Showing 6 changed files with 292 additions and 216 deletions.
6 changes: 3 additions & 3 deletions _config.yml
@@ -1,6 +1,6 @@
 # Dependencies
 markdown: redcarpet
-highlighter: pygments
+highlighter: rouge

 # Permalinks
 permalink: pretty
@@ -15,7 +15,7 @@ baseurl: /BD_STTP_2016

 author:
   name: 'Ashesh Kumar Singh'
-  url: https://plus.google.com/+AsheshKumar501254
+  url: https://user501254.github.io

 paginate: 5

@@ -25,4 +25,4 @@ version: 2.1.0
 github:
   repo: https://github.com/user501254/BD_STTP_2016

-gems: [jekyll-paginate, jekyll-gist]
+plugins: [jekyll-paginate, jekyll-gist]
3 changes: 2 additions & 1 deletion _includes/head.html
@@ -22,8 +22,9 @@

 <!-- Icons -->
 <link rel="apple-touch-icon-precomposed" sizes="144x144" href="{{ site.baseurl }}/public/apple-touch-icon-144-precomposed.png">
-<link rel="shortcut icon" href="{{ site.baseurl }}/public/favicon.ico">
+<link rel="shortcut icon" href="{{ site.baseurl }}/public/favicon.ico">

 <!-- RSS -->
 <link rel="alternate" type="application/rss+xml" title="RSS" href="/atom.xml">

 </head>
55 changes: 39 additions & 16 deletions _layouts/default.html
@@ -32,24 +32,47 @@
 </script>
 <noscript>Please enable JavaScript to view the <a href="https://disqus.com/?ref_noscript" rel="nofollow">comments powered by Disqus.</a></noscript>
 </div>
-<script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
-<!-- test -->
-<ins class="adsbygoogle"
-     style="display:block"
-     data-ad-client="ca-pub-2940148750940936"
-     data-ad-slot="9093092606"
-     data-ad-format="auto"></ins>
-<script>
-(adsbygoogle = window.adsbygoogle || []).push({});
+<script type="text/javascript">
+
+  // Passive event listeners (coz Lighthouse won't stop complaining).
+  // Test via a getter in the options object to see if the passive property is accessed.
+  var supportsPassive = false;
+  try {
+    var opts = Object.defineProperty({}, 'passive', {
+      get: function () {
+        supportsPassive = true;
+      }
+    });
+    window.addEventListener("testPassive", null, opts);
+    window.removeEventListener("testPassive", null, opts);
+  } catch (e) { }
+
+  // Use the detection result: passive is applied if supported; capture will be false either way.
+  ['wheel', 'mousewheel', 'touchstart', 'touchmove'].forEach(function (e) {
+    document.addEventListener(e, function () { }, supportsPassive ? { passive: true } : false);
+  });
 </script>
+<script type="text/javascript">
+  WebFontConfig = {
+    google: { families: ['Roboto', 'Roboto+Slab'] }
+  };
+  (function () {
+    var wf = document.createElement('script');
+    wf.src = 'https://cdnjs.cloudflare.com/ajax/libs/webfont/1.6.28/webfontloader.js';
+    wf.type = 'text/javascript';
+    wf.async = 'true';
+    var s = document.getElementsByTagName('script')[0];
+    s.parentNode.insertBefore(wf, s);
+  })();
+</script>
+<!-- Global site tag (gtag.js) - Google Analytics -->
+<script async src="https://www.googletagmanager.com/gtag/js?id=UA-71845746-1"></script>
 <script>
-(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
-(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
-m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
-})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
-
-ga('create', 'UA-71845746-1', 'auto');
-ga('send', 'pageview');
+window.dataLayer = window.dataLayer || [];
+function gtag() { dataLayer.push(arguments); }
+gtag('js', new Date());
+
+gtag('config', 'UA-71845746-1');
 </script>

 </body>
203 changes: 111 additions & 92 deletions hadoop-multi-node-cluster-installation-guide.md
The updated guide, as of this commit:

---
layout: page
title: Hadoop Multi-Node Cluster Installation Guide
---

## Prerequisites

Make sure that you have a reliable network without host isolation. Static IP assignment is preferable, or at least have an extremely long DHCP lease. Additionally, all nodes (Namenode/master and Datanodes/slaves) should have a common user account with the same password; in case you don't, make such a user account on all nodes. Having the same username and password on all nodes makes things a bit less complicated.

First configure all nodes for a single-node cluster. You can use the script that I have posted [here](https://github.com/user501254/BD_STTP_2016/blob/master/InstallHadoop.sh). A quick check that each single-node setup works is shown below.
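A minimal sketch, assuming each node was configured with the single-node script above: start the daemons on that node, confirm with `jps` that they are all up, then stop them again before continuing.

    start-dfs.sh; start-yarn.sh
    jps
    # Expect entries like: NameNode, DataNode, SecondaryNameNode,
    # ResourceManager, NodeManager (and Jps itself); PIDs will differ.
    stop-dfs.sh; stop-yarn.sh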
## STEP 1: Stopping Hadoop Daemons and cleaning up HDFS files

1. **On all nodes**, stop any running Hadoop daemons and clean up the temporary files; confirm that the daemons have stopped by running the `jps` command.

    `stop-dfs.sh; stop-yarn.sh; rm -rf /tmp/hadoop-$USER`

2. **On Namenode/master only**, remove the datanode directory from HDFS.

    `rm -rf ~/hadoop_store/hdfs/datanode`

3. **On Datanodes/slaves only**, remove the namenode directory from HDFS.

    `rm -rf ~/hadoop_store/hdfs/namenode`
## STEP 2: Configuring connectivity

1. **On all nodes**, add IP addresses and the corresponding hostnames for all nodes in the cluster.

    `sudo nano /etc/hosts`

    The `/etc/hosts` file should look somewhat like this after you edit it:

        xxx.xxx.xxx.xxx master
        xxx.xxx.xxx.xxy slave1
        xxx.xxx.xxx.xxz slave2

    Additionally, you may need to remove lines like "xxx.xxx.xxx.xxx localhost" if they exist. However, it is okay to keep lines like "127.0.0.1 localhost" and others.

2. **On all nodes**, configure the firewall.

    Allow the default or custom ports that you plan to use for the various Hadoop daemons through the firewall (a sketch follows below),

    OR

    much easier, disable the firewall entirely (never on a production system):

    - on RedHat-like distros (Fedora, CentOS):
      `sudo systemctl stop firewalld; sudo systemctl disable firewalld`

    - on Debian-like distros (Ubuntu):
      `sudo ufw disable`
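    If you would rather keep the firewall enabled, here is a minimal `firewalld` sketch; the ports listed are the usual Hadoop 2.x defaults (an assumption, so adjust them to whatever your configuration files actually use):

        # Open common Hadoop 2.x default ports (adjust to your configuration)
        sudo firewall-cmd --permanent --add-port=9000/tcp    # fs.defaultFS (NameNode RPC)
        sudo firewall-cmd --permanent --add-port=50070/tcp   # NameNode web UI
        sudo firewall-cmd --permanent --add-port=50010/tcp   # DataNode data transfer
        sudo firewall-cmd --permanent --add-port=50075/tcp   # DataNode web UI
        sudo firewall-cmd --permanent --add-port=8088/tcp    # ResourceManager web UI
        sudo firewall-cmd --reload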
3. **On Namenode/master only**, gain `ssh` access from the Namenode (master) to all Datanodes (slaves). If the master does not have an ssh key pair yet, see the sketch after this item.

    `ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave1`

    Then confirm connectivity by running `ping slave1`, `ssh slave1`.

    `ssh-copy-id -i ~/.ssh/id_rsa.pub $USER@slave2`

    Then confirm connectivity by running `ping slave2`, `ssh slave2`.

    Make sure that you get a proper response. Remember to exit each of your ssh sessions by typing `exit` or closing the terminal. To be on the safer side, also make sure that all datanodes are able to access each other.
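    A minimal sketch for generating that key pair, using the OpenSSH defaults assumed above (`~/.ssh/id_rsa`):

        # Generate an RSA key pair with an empty passphrase at the default location
        ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa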


## STEP 3: Editing Configuration Files

1. **On all nodes**, edit Hadoop's `core-site.xml` file

`nano /usr/local/hadoop/etc/hadoop/core-site.xml`

    ```xml
    <configuration>
        <property>
            <name>fs.defaultFS</name>
            <value>hdfs://master:9000</value>
            <description>NameNode URI</description>
        </property>
    </configuration>
    ```
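    To confirm that the new value is picked up, one option (assuming the Hadoop binaries are on your `PATH`) is:

        hdfs getconf -confKey fs.defaultFS
        # should print: hdfs://master:9000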

2. **On all nodes**, edit Hadoop's `yarn-site.xml` file

    `nano /usr/local/hadoop/etc/hadoop/yarn-site.xml`

    ```xml
    <configuration>
        <property>
            <name>yarn.resourcemanager.hostname</name>
            <value>master</value>
            <description>The hostname of the RM.</description>
        </property>
        <property>
            <name>yarn.nodemanager.aux-services</name>
            <value>mapreduce_shuffle</value>
        </property>
        <property>
            <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
            <value>org.apache.hadoop.mapred.ShuffleHandler</value>
        </property>
    </configuration>
    ```
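    Later, once the cluster is up (see the testing section below), one way to confirm that the NodeManagers on the slaves have registered with the ResourceManager is:

        yarn node -list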

3. **On all nodes**, edit Hadoop's `slaves` file; remove the text "localhost" and add the slave hostnames

    `nano /usr/local/hadoop/etc/hadoop/slaves`

        slave1
        slave2

    I guess having this only on the Namenode/master will also work, but I did this on all nodes anyway. Also note that in this configuration the master behaves only as the resource manager; this is how I intend it to be.

4. **On all nodes**, modify the `hdfs-site.xml` file and change the value of the `dfs.replication` property to something greater than 1. It should be at least the number of slaves in the cluster; here I have two slaves, so I would set it to 2. A sketch follows below.
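    A minimal sketch of the relevant property, assuming a two-slave cluster as above (adjust the value to your own cluster):

    ```xml
    <configuration>
        <property>
            <name>dfs.replication</name>
            <value>2</value>
            <description>Default block replication.</description>
        </property>
    </configuration>
    ```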

5. **On Namenode/master only**, (re)format the HDFS through the namenode

    `hdfs namenode -format`
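    If the format succeeds, the command's output should end with a message along the lines of "Storage directory ... has been successfully formatted" (exact wording varies between Hadoop versions).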

6. **Optional**
    - remove the `dfs.datanode.data.dir` property from the master's `hdfs-site.xml` file.
    - remove the `dfs.namenode.name.dir` property from all slaves' `hdfs-site.xml` files.


## Testing our setup (execute only on Namenode/master)

    start-dfs.sh; start-yarn.sh

    echo "hello world hello Hello" > ~/Downloads/test.txt

    hadoop fs -mkdir /input
    hadoop fs -put ~/Downloads/test.txt /input
    hadoop jar \
        /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar \
        wordcount \
        /input /output

Wait for a few seconds and the mapper and reducer should begin.
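Once the job completes, a quick way to inspect the result (the part file name below is the usual MapReduce default, but it may differ on your setup):

    hadoop fs -cat /output/part-r-00000
    # expected counts for the test file:
    # Hello   1
    # hello   2
    # world   1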