作者: 勤一 | 来源:发表于2016-08-27 17:00 被阅读0次

           工作一年多了,一直在断断续续地学习和使用 Hadoop。它提供的 Map 和 Reduce 数据处理引擎能够帮助我们方便地处理大数据集,HDFS 分布式文件系统可以帮助我们冗余地存储大数据集。这么好的一门技术,应该要好好学习。
            最近一直在看《MapReduce设计模式》一书,里面介绍了许多问题的解决办法。总觉得读书应该记些笔记,这里就写点什么吧!

           首先,需要准备一个数据集,供以后的数据处理算法使用:

    1. 这个数据集包含10000行数据,其中的每一行都是 json 字符串
    2. 每个 json 字符串中包含一个用户的四个基本信息(id, userName, sex, age)

           接下来,生成这个数据集:

    package hadoop_design.mock_user_info;
    
    /**
     * Plain data holder for the basic information of one user.
     *
     * <p>Exposes four read/write properties through getter/setter pairs:
     * {@code userName}, {@code age}, {@code sex} and {@code id}.
     */
    public class UserBean {
    
        /** User name. */
        private String userName;
    
        /** Age of the user. */
        private int age;
    
        /** Sex flag: 1 means male, 0 means female. */
        private int sex;
    
        /** Identifier; intended to hold md5(userName + age + sex). */
        private String id;
    
        /** @return the user name */
        public String getUserName() { return userName; }
    
        /** @param userName the user name to store */
        public void setUserName(String userName) { this.userName = userName; }
    
        /** @return the age */
        public int getAge() { return age; }
    
        /** @param age the age to store */
        public void setAge(int age) { this.age = age; }
    
        /** @return the sex flag (1 = male, 0 = female) */
        public int getSex() { return sex; }
    
        /** @param sex the sex flag to store (1 = male, 0 = female) */
        public void setSex(int sex) { this.sex = sex; }
    
        /** @return the identifier */
        public String getId() { return id; }
    
        /** @param userId the identifier to store */
        public void setId(String userId) { this.id = userId; }
    }
    
    package hadoop_design.mock_user_info;
    
    import net.sf.json.JSONObject;
    
    /**
     * Json 工具类
     * Created by zhanghu on 16/8/27.
     */
    public class JsonUtils {
    
        public static String objectToJsonString(Object object) {
            JSONObject json = JSONObject.fromObject(object);
            return json.toString();
        }
    }
    
    package hadoop_design.mock_user_info;
    
    import java.util.Random;
    
    /**
     * Static helper around a single shared {@link Random} instance.
     *
     * <p>The generator is seeded with {@link System#currentTimeMillis()} when the
     * class is initialized and can be re-seeded through {@link #setSeed(long)}.
     * NOTE(review): the shared state is accessed without any synchronization.
     *
     * <p>Adapted from: http://www.cnblogs.com/dongliyang/archive/2013/04/01/2994554.html
     */
    public final class StdRandom {
    
        // Shared generator; replaced every time setSeed is called.
        private static Random random;
        // Seed that the current generator was created with.
        private static long seed;
    
        // Initialize the seed and the generator when the class is loaded.
        static {
            seed = System.currentTimeMillis();
            random = new Random(seed);
        }
    
        // Private constructor: this class is not meant to be instantiated.
        private StdRandom() {}
    
        /**
         * Re-seeds the generator. A fresh {@link Random} is created from the seed,
         * so the sequence produced afterwards depends only on {@code s}.
         *
         * @param s seed for the random number generator
         */
        public static void setSeed(long s) {
            seed = s;
            random = new Random(seed);
        }
    
        /**
         * Returns the seed of the current generator.
         *
         * @return the seed most recently applied (initially the class-load time in milliseconds)
         */
        public static long getSeed() {
            return seed;
        }
    
        /**
         * Returns a random real number in [0, 1).
         *
         * @return the next {@code double} of the shared generator
         */
        public static double uniform() {
            return random.nextDouble();
        }
    
        /**
         * Returns a random integer in [0, N).
         *
         * @param N exclusive upper bound; must be positive
         * @return a random integer between 0 (inclusive) and {@code N} (exclusive)
         * @throws IllegalArgumentException if {@code N} is not positive
         */
        public static int uniform(int N) {
            if (N <= 0) {
                throw new IllegalArgumentException("N must be positive: " + N);
            }
            return random.nextInt(N);
        }
    
        /**
         * Returns a random real number in [0, 1); alias of {@link #uniform()}.
         *
         * @return a random {@code double}
         */
        public static double random() {
            return uniform();
        }
    
        /**
         * Returns a random integer in [a, b).
         *
         * <p>Ranges wider than {@link Integer#MAX_VALUE} (for example
         * {@code uniform(Integer.MIN_VALUE, Integer.MAX_VALUE)}) are supported.
         *
         * @param a inclusive lower bound
         * @param b exclusive upper bound; must be greater than {@code a}
         * @return a random integer between {@code a} (inclusive) and {@code b} (exclusive)
         * @throws IllegalArgumentException if {@code b <= a}
         */
        public static int uniform(int a, int b) {
            if (b <= a) {
                throw new IllegalArgumentException(
                        "upper bound must be greater than lower bound: [" + a + ", " + b + ")");
            }
            // Compute the width in long arithmetic: b - a can overflow int.
            long span = (long) b - a;
            if (span <= Integer.MAX_VALUE) {
                return a + uniform((int) span);
            }
            // The range covers at least half of all int values, so rejection
            // sampling over the full int range accepts with probability >= 1/2.
            int candidate;
            do {
                candidate = random.nextInt();
            } while (candidate < a || candidate >= b);
            return candidate;
        }
    
        /**
         * Returns a random real number between a and b, computed as
         * {@code a + uniform() * (b - a)}.
         *
         * @param a lower bound
         * @param b upper bound
         * @return a random {@code double} between the two bounds
         */
        public static double uniform(double a, double b) {
            return a + uniform() * (b - a);
        }
    }
    
    package hadoop_design.mock_user_info;
    
    /**
     * Helper methods for working with String objects.
     */
    public class StringUtils {
    
        // Candidate characters for the free positions: digits, then A-Z, then a-z (62 in total).
        private static final String ALPHABET =
                "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz";
    
        /**
         * Builds a random string that contains at least one digit, one upper-case
         * letter and one lower-case letter.
         *
         * @param len length of the string to build; must not be less than 3
         * @return the random string
         * @throws IllegalArgumentException if {@code len} is less than 3
         */
        public static String randomStr(int len) {
            if (len < 3) {
                throw new IllegalArgumentException("字符串长度不能小于3");
            }
    
            char[] buffer = new char[len];
    
            // The first three slots guarantee one digit, one upper-case and one lower-case letter.
            buffer[0] = (char) ('0' + StdRandom.uniform(0, 10));
            buffer[1] = (char) ('A' + StdRandom.uniform(0, 26));
            buffer[2] = (char) ('a' + StdRandom.uniform(0, 26));
    
            // Every remaining slot gets an arbitrary character of the alphabet.
            int pos = 3;
            while (pos < len) {
                buffer[pos] = ALPHABET.charAt(StdRandom.uniform(0, ALPHABET.length()));
                pos++;
            }
    
            shuffle(buffer);
            return String.valueOf(buffer);
        }
    
        // Randomly reorders the array in place so the three mandatory characters do not stay in front.
        private static void shuffle(char[] chars) {
            int size = chars.length;
            for (int left = 0; left < size; left++) {
                int pick = left + StdRandom.uniform(size - left);
                char held = chars[left];
                chars[left] = chars[pick];
                chars[pick] = held;
            }
        }
    }
    
    package hadoop_design.mock_user_info;
    
    import org.apache.commons.codec.digest.DigestUtils;
    
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    
    /**
     * Main program: writes a file of mock user records, one JSON string per line.
     */
    public class Main {
    
        // Path of the generated data file (relative to the working directory).
        private static final String OUTPUT_FILE = "user.data";
        // Number of JSON lines to generate.
        private static final int TOTAL_LINES = 10000;
        // A progress message is printed every this many lines.
        private static final int PROGRESS_INTERVAL = 1000;
    
        /**
         * Creates one random user and serializes it to JSON.
         *
         * <p>The user name is a random string whose length is drawn from [10, 21),
         * the age from [0, 100) and the sex from [0, 2). The id is the MD5 hex digest
         * of the concatenation userName + age + sex.
         *
         * @return the JSON string for the generated user
         */
        private static String generateData() {
            UserBean userBean = new UserBean();
            userBean.setUserName(StringUtils.randomStr(StdRandom.uniform(10, 21)));
            userBean.setAge(StdRandom.uniform(0, 100));
            userBean.setSex(StdRandom.uniform(0, 2));
    
            String md5Bean = userBean.getUserName() + userBean.getAge() + userBean.getSex();
            userBean.setId(DigestUtils.md5Hex(md5Bean));
            return JsonUtils.objectToJsonString(userBean);
        }
    
        /**
         * Generates the data file, printing a progress line to standard output
         * at regular intervals.
         *
         * @param args command-line arguments (unused)
         * @throws IOException if the output file cannot be opened or written
         */
        public static void main(String[] args) throws IOException {
    
            File file = new File(OUTPUT_FILE);
    
            // try-with-resources closes (and thereby flushes) the writer even when a write fails.
            try (BufferedWriter out = new BufferedWriter(new FileWriter(file))) {
                for (int i = 0; i < TOTAL_LINES; ++i) {
                    if (i % PROGRESS_INTERVAL == 0) {
                        System.out.println("mock data line : " + i);
                    }
                    out.write(generateData());
                    out.newLine();
                }
            }
        }
    }
    

            OK, 利用上面的程序,我得到了一个包含 10000 行用户信息的 json 文本文件,类似于下面这样:

    {"age":48,"id":"7a8bd2dc862f8ce972292474f2f3bc56","sex":1,"userName":"dHI3w56HNTiQh"}
    {"age":18,"id":"fbcf2df050aa2da3c678dcb0a02bda2d","sex":1,"userName":"Xh7mU53Ba7JZ"}
    {"age":70,"id":"4808f32ecbbe21b93882bb44973e7bea","sex":1,"userName":"aLV5E156YdJ"}
    {"age":57,"id":"a9863ef325a6ca91f2554e8f4874d424","sex":1,"userName":"CwQ43w548IS"}
    {"age":71,"id":"e4e7724632feefc514902d0849a86d6b","sex":1,"userName":"sz9hcdCZnkVXC3x"}
    {"age":26,"id":"25ae8b3ab30f11a267939fec7177f829","sex":1,"userName":"cA17IpnzzPFMv4"}
    {"age":58,"id":"52a7cb852583fe183d300de9c6de0efa","sex":1,"userName":"9b8v3HFIaNqsIyC2a97"}
    {"age":55,"id":"71fd9c057f2c60b99021ce8a353c5cb0","sex":0,"userName":"9OqAJlyZVKgpV"}
    {"age":25,"id":"6b758fb1a4e78930ea919074d5abb172","sex":0,"userName":"OvHcn61daoXTu"}
    {"age":90,"id":"02883e46b66fd848075401ae205b0896","sex":0,"userName":"4kyHB1s5v6nQ049"}
    

            下面,开始使用这些数据吧!

    相关文章

      网友评论

          本文标题:

          本文链接:https://www.haomeiwen.com/subject/oiqzsttx.html