Submitting a Job from a Java Web App to a Remote Hadoop Cluster

A previous article, Eclipse編寫MapReduce程序 (Writing MapReduce Programs in Eclipse), actually ran everything locally.
I searched online for tutorials on submitting a job from a Java web project for server-side MapReduce execution, but none of the material I found worked. After working through all kinds of errors I finally succeeded, hence this article as a record.

1. Create a plain Java web project named WordCountPage

This article uses wordcount as an example; it is meant only as a starting point.

2. Code

TWC3.java

//TWC3.java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TWC3 {
    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String valueString = value.toString();

            StringTokenizer itr = new StringTokenizer(valueString);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(TWC3.class);
        // ship the pre-uploaded jar to the cluster; this overrides setJarByClass
        job.setJar(args[2]);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // remove the output directory if it already exists, otherwise the job fails
        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // System.exit(job.waitForCompletion(true)?0:1);
        job.waitForCompletion(true);

    }
}

WCServlet.java

//WCServlet.java
import java.io.IOException;
import java.util.Calendar;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Servlet implementation class WCServlet
 */
@WebServlet("/WCServlet")
public class WCServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;
       
    /**
     * @see HttpServlet#HttpServlet()
     */
    public WCServlet() {
        super();
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.getWriter().append("Served at: ").append(request.getContextPath());
        String[] args = new String[3];
        Calendar now = Calendar.getInstance();
        args[0] = "hdfs://master:9000/input/";                          // input data on HDFS
        args[1] = "hdfs://master:9000/output/" + now.getTimeInMillis(); // fresh output dir per run

        // path of the pre-uploaded MR jar under Tomcat's lib/userlib
        String a = System.getProperty("catalina.home") + "/lib/userlib/TWC2-3.jar";
//      System.out.println("========a======="+a );
        response.getWriter().append(" jar: " + a);

        args[2] = a;
        try {
            TWC3.main(args);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();

        }
        
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // TODO Auto-generated method stub
        doGet(request, response);
    }

}

index.jsp

<%@ page language="java" contentType="text/html; charset=ISO-8859-1"
    pageEncoding="ISO-8859-1"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Insert title here</title>
</head>
<body>

    <form action="WCServlet" method="post">
        <input type="text" name="keyword"> <br>
        <input type="submit" value="Submit">
    </form>
 
    <hr>
</body>
</html>
A brief explanation of the code:
index.jsp

1. It contains a single form; when it is submitted, the request goes to WCServlet for processing.

WCServlet.java

1. doPost() simply delegates to doGet().
2. A String[] args array is created and filled in. args[0]="hdfs://master:9000/input/" is the HDFS location of the data the MR program will process. args[1]="hdfs://master:9000/output/" + now.getTimeInMillis() is the folder the results are saved to; the timestamp gives each run its own output directory.
3. The Hadoop server does not have your MR program, so you have to upload it in advance. String a = System.getProperty("catalina.home") + "/lib/userlib/TWC2-3.jar"; is the path of the jar file the MR job will use, and args[2] = a; stores it in the array so it can be passed along.
4. TWC3.main(args); calls the main function of the TWC3 class, passing the array as its arguments. Note the call is synchronous, so the request blocks until the job finishes; see the sketch after this list.
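
One thing worth calling out (my addition, not from the original write-up): because the call is synchronous, a long MR job will hold the HTTP request open until it completes. A minimal sketch of handing the job to a background thread instead, assuming fire-and-forget semantics are acceptable:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class AsyncSubmitExample {
    // one background thread is enough for occasional submissions
    private static final ExecutorService JOB_POOL = Executors.newSingleThreadExecutor();

    public static void submit(final String[] args) {
        JOB_POOL.submit(new Runnable() {
            public void run() {
                try {
                    TWC3.main(args);      // blocks here, not in the servlet thread
                } catch (Exception e) {
                    e.printStackTrace();  // failures show up in catalina.out
                }
            }
        });
    }
}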

TWC3.java

1. MyMapper and MyReducer implement map and reduce respectively.
2. The main() function configures the job.
3. job.setJar(args[2]); specifies the jar file to execute. As mentioned above, the server does not have your MR program, so you must package it into a jar yourself and upload it ahead of time; see the annotated excerpt below.
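
Why setJar() is needed here, as I understand it: setJarByClass() can only locate a jar when the class actually lives inside one on the local classpath, but in a web app the compiled classes sit unpacked under WEB-INF/classes, so the MR client has no jar to ship to the cluster. An annotated excerpt of the two lines from the driver above:

// Normal command-line case: Hadoop finds the jar containing TWC3 on the
// local classpath and ships it to the cluster automatically.
job.setJarByClass(TWC3.class);

// Web-app case: the classes are unpacked under WEB-INF/classes, so there
// is no jar to find. Point the client at the pre-built jar explicitly
// (args[2], i.e. .../lib/userlib/TWC2-3.jar); the later call wins.
job.setJar(args[2]);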

3. Add the related files to Eclipse

(image: related-files.png)

These files can be copied down from the Hadoop server (with FileZilla, Xftp, or similar) and then lightly modified.
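
As far as I can tell, once these XML files are on the web app's runtime classpath (e.g. under src/ in Eclipse), the Hadoop client picks them up automatically as Configuration, the HDFS client, and the Job class load. If you would rather load them explicitly, a minimal sketch (the /path/to/ prefix is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ConfLoadExample {
    public static Configuration load() {
        Configuration conf = new Configuration();
        // explicit loading, for when the XML files are not on the classpath
        conf.addResource(new Path("/path/to/core-site.xml"));
        conf.addResource(new Path("/path/to/hdfs-site.xml"));
        conf.addResource(new Path("/path/to/mapred-site.xml"));
        conf.addResource(new Path("/path/to/yarn-site.xml"));
        return conf;
    }
}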

core-site.xml

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.default.name</name>
        <value>hdfs://master:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/tmp</value>
    </property>
    <property>
        <name>io.file.buffer.size</name>
        <value>131702</value>
    </property>
</configuration>
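
A side note: fs.default.name is the old Hadoop 1.x key; in 2.7.3 it still works but is a deprecated alias of fs.defaultFS. Before wiring everything into the servlet, a quick standalone check that the client can actually reach the NameNode (my own suggestion, not from the original article):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsPing {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();  // reads core-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);
        System.out.println("default FS: " + fs.getUri());  // expect hdfs://master:9000
        System.out.println("/ exists: " + fs.exists(new Path("/")));
    }
}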
hdfs-site.xml
<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>master:9001</value>
    </property>
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
        <value>false</value>
    </property>
    <property>
        <name>dfs.permissions</name>
        <value>false</value>
    </property> 
</configuration>
mapred-site.xml
<!-- Put site-specific property overrides in this file. -->

<configuration>

    <property>
        <name>mapred.remote.os</name>
        <value>Linux</value>
    </property>
     
    <property>
        <name>mapreduce.app-submission.cross-platform</name>
        <value>true</value>
    </property>
    
    <property>
        <name>mapreduce.application.classpath</name>
        <value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
                /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*
        </value>
    </property>

<!-- ===========================================  -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>


    <property>  
        <name>mapreduce.map.memory.mb</name>  
        <value>1024</value>  
    </property>  
    <property>  
        <name>mapreduce.reduce.memory.mb</name>  
        <value>1024</value>  
    </property>  
    <property>  
        <name>mapreduce.map.java.opts</name>  
        <value>-Xmx512m</value>  
    </property>  
    <property>  
        <name>mapreduce.reduce.java.opts</name>  
        <value>-Xmx512m</value>  
    </property>  
</configuration>
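
The two client-side keys that matter most when submitting from a non-Linux machine are mapreduce.framework.name and mapreduce.app-submission.cross-platform. If you prefer, the same settings can be made programmatically before creating the Job; a sketch using standard Hadoop 2.x keys with the values from the files above:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class RemoteConfExample {
    public static Job newRemoteJob() throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "master");
        // lets a Windows client generate Linux-style container command lines
        conf.set("mapreduce.app-submission.cross-platform", "true");
        return Job.getInstance(conf);
    }
}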

yarn-site.xml
<configuration>

<!-- Site specific YARN configuration properties -->

<property>
    <name>yarn.application.classpath</name>
    <value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
        /home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*</value>
</property>
    <property>  
            <description>The hostname of the RM.</description>  
            <name>yarn.resourcemanager.hostname</name>  
            <value>master</value>  
    </property>  

    <property>
      <name>yarn.nodemanager.aux-services</name>
      <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>${yarn.resourcemanager.hostname}:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>${yarn.resourcemanager.hostname}:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>${yarn.resourcemanager.hostname}:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>${yarn.resourcemanager.hostname}:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>${yarn.resourcemanager.hostname}:8088</value>
    </property>

    <property>  
        <description>The https address of the RM web application.</description>  
        <name>yarn.resourcemanager.webapp.https.address</name>  
        <value>${yarn.resourcemanager.hostname}:8090</value>  
   </property>

  <property>  
    <description>List of directories to store localized files in. An   
      application's localized file directory will be found in:  
      ${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}.  
      Individual containers' work directories, called container_${contid}, will  
      be subdirectories of this.  
   </description>  
    <name>yarn.nodemanager.local-dirs</name>  
    <value>/data/hadoop/yarn/local</value>  
  </property>  
  
  <property>  
    <description>Whether to enable log aggregation</description>  
    <name>yarn.log-aggregation-enable</name>  
    <value>true</value>  
  </property>  
  
  <property>  
    <description>Where to aggregate logs to.</description>  
    <name>yarn.nodemanager.remote-app-log-dir</name>  
    <value>/data/tmp/logs</value>  
  </property>  
  
  <property>  
    <description>Amount of physical memory, in MB, that can be allocated   
    for containers.</description>  
    <name>yarn.nodemanager.resource.memory-mb</name>  
    <value>2048</value>  
  </property>  
<property>  
    <name>yarn.scheduler.minimum-allocation-mb</name>  
    <value>512</value>  
</property>  
<property>  
    <name>yarn.nodemanager.vmem-pmem-ratio</name>  
    <value>1.0</value>  
</property>  
<property>  
    <name>yarn.nodemanager.vmem-check-enabled</name>  
    <value>false</value>  
</property> 

</configuration>
log4j.properties
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n 

4. Export TWC3.java as a jar file

Right-click the TWC3.java file -- Export... -- Runnable JAR file

(image: Runnable JAR file.png)

Save it to a location of your choice. Make sure the final file name matches what WCServlet expects (TWC2-3.jar).

(image: twc3.jar)

5. Export the WordCountPage project as a war file

Right-click the WordCountPage project -- Export -- WAR file


6. Upload the files to the server

1. Upload TWC2-3.jar to /home/ubuntu/developer/apache-tomcat-8.5.14/lib/userlib on the server.

(image: twc2-3.jar-to-tomcat)

2. Upload WordCountPage.war to /home/ubuntu/developer/apache-tomcat-8.5.14/webapps on the server.

(image: WordCountPage.war-to-tomcat.png)

7. Start Tomcat

Go to the Tomcat directory on the server and run:

bin/startup.sh
(image: startup-tomcat)

8. Run and test

Open http://your-ip:port/your-project in a browser and submit the form. You can also follow the job's progress in the YARN ResourceManager web UI on port 8088 (configured above).

(screenshots: jobs)

Some problems you may encounter

(image: permission-denied)

Solution: open up permissions on the HDFS staging directory:

hdfs dfs -chmod -R 755 /tmp
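
The error typically appears because the job is submitted as the user running Tomcat, which HDFS does not know (unless you disabled permission checks via dfs.permissions in hdfs-site.xml above). Another workaround I have seen (an assumption on my part, not something the original article does) is to set the client-side user before the first Hadoop call:

public class UserWorkaround {
    public static void apply() {
        // The Hadoop client checks the HADOOP_USER_NAME environment variable
        // and, failing that, the JVM system property of the same name, before
        // falling back to the OS user. "ubuntu" is a placeholder for whichever
        // user owns the HDFS directories.
        System.setProperty("HADOOP_USER_NAME", "ubuntu");
        // call this before the first FileSystem.get() / Job.getInstance()
    }
}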

Viewing Tomcat logs in real time on Linux
1. Change into the logs directory: cd tomcat/logs
2. Run tail -f catalina.out
3. The running log now streams in real time.

Press Ctrl+C to exit tail.
