美文网首页D3
Python词频统计,D3.js word cloud生成,V3

Python词频统计,D3.js word cloud生成,V3

作者: Kaidi_G | 来源:发表于2017-07-19 04:42 被阅读1024次
    词云的生成

    词频计算部分


    原数据从mysql中获取. 我要统计返回行tag属性中包含的tag词频。

    # Query text: select the `tags` column of every row in table `wbdata`.
    sql_tags_all = "select tags from wbdata;"
    # Run the query on a cursor that is created elsewhere (not shown in this snippet).
    cursor.execute(sql_tags_all)
    # Pull all result rows into memory at once.
    sql_tags_all_result = cursor.fetchall()
    

    返回数据的每一个row都是一个微博对应的5个tag,由空格分开.
    接下来把查询到的所有行的tag合并

    def turn_tags_tostring(sql_result):
        """Flatten the tag field of every row into one comma-terminated string.

        The first field of each row is split on single spaces. The first piece
        of every row is skipped; each remaining piece is emitted followed by
        a comma. Returns "" when nothing is emitted.
        """
        pieces = []
        for record in sql_result:
            tokens = record[0].split(" ")
            # tokens[0] is left out, exactly as the index-based loop did.
            for token in tokens[1:]:
                pieces.append(token + ",")
        return "".join(pieces)
    

    接下来是对输出的这个string进行词频统计.
    这里涉及到后面词云的生成,因此输出上有规范.
    已知目标输出格式为一个list,包含了所有出现在词云中的词语,每个词语是一个字典,分别为文字部分和大小,样例为:
    [{"text":"德国","size":120},{"text":"motion","size":15},{"text":"forces","size":10}]
    因此,在统计完词频后,要将频率转化为size.
    在这里,词云词语的大小范围定为10到120.
    线性转化方法:

    def linear_scale(inputmin,inputmax,outputmin,outputmax,item):
        """Map `item` linearly from [inputmin, inputmax] onto [outputmin, outputmax].

        inputmin maps to outputmin and inputmax maps to outputmax; values
        outside the input range are extrapolated along the same line.

        When inputmin == inputmax the slope is undefined, so the midpoint of
        the output range is returned instead of raising ZeroDivisionError.
        """
        if inputmax == inputmin:
            # Degenerate input range (e.g. every word has the same count).
            return (outputmin + outputmax) / 2.0
        a = float(outputmax-outputmin)/float(inputmax-inputmin)  # slope
        b = outputmax - a*inputmax                               # intercept
        return a*item + b
    

    接下来就是输入string和希望取的前多少位词语,(下面的代码块儿都为一个函数)

    def wordscounter(text, n):
        # Tally how often each comma-separated token occurs in `text`.
        # Empty tokens (e.g. the one after a trailing comma) are tallied like any other.
        wordDict = {}
        for token in text.split(","):
            wordDict[token] = wordDict.get(token, 0) + 1
    

    然后加入了无关词语过滤,手动去掉高频无关词汇:

    
        removelist = ["秒拍", "视频", "网页", "分享", "全文", "链接", "00", "01","1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
                      "12", "13", "14", "15","16","17","18","19","20", "21", "22", "23", "24", "25", "26", "27", "28","29", "30","31","100", "20",  "40", "50",
                      "60", "70", "80","90"]//这里添加一个list去掉无关但是频率较高的词语
        for word in removelist:
            try:
                del wordDict[word.decode("utf-8")]
                print "delet", word
            except Exception:
                pass
    

    接下来使用counter方法来进行词频统计,并取得前n位最频繁的词语:

        # Wrap the tallies in a Counter so they can be ranked by frequency.
        count = Counter(wordDict)
        # most_common() without an argument lists every (word, count) pair from
        # most to least frequent; the slice keeps the first n of them.
        rank = count.most_common()[:n]
    

    接下来要使用最开始定义的线性变换方法来计算每个单词的大小了,
    因为在数据量小的时候,词频最高的第一位由于是关键词,会把第二位甩得老远,做出来的词云没有层次、不好看,所以在这里,我设定词频最高的第一个词大小恒定为120,从第二名开始到最后一名,再按词频从110到10进行线性变换.

        countmax = rank[1][1]   //取得最大词频
        countmin = rank[-1][1]  //取得最小词频
        diclist = []
        for item in rank:
            rankdic = {}
            rankdic['text'] = item[0]  //设定文字部分
            rankdic['size'] = functions.linear_scale(countmin,countmax,10,110,item[1])
            diclist.append(rankdic) //调用线性变换方法根据词频得到每个单词的大小
        diclist[0]['size'] = 120  // 把首词大小设为120
        print json.dumps(diclist, encoding="UTF-8", ensure_ascii=False)   //打印中文检查结果
    

    词云部分


    包含以下script

      <!-- D3 v3 core; the word-cloud layout attaches itself to d3.layout. -->
      <script src="//d3js.org/d3.v3.min.js"></script>
      <!-- NOTE(review): topojson is not referenced by any code shown in this article; confirm it is needed. -->
      <script src="//d3js.org/topojson.v1.min.js"></script>
      <!-- Local copy of the word-cloud layout (its source is listed at the end of the article). -->
      <script src="./static/lib/js/d3.layout.cloud.js"></script>
    

    d3.layout.cloud.js在文章最后给出

    frequency_list就是上面返回的格式为[{"text":"text", "size":120},{"text":"text", "size":100}]的列表.

       //获得数据
       var frequency_list ={{frequency_list}};
    
       //设定一个线性非连贯比例尺来进行给不同大小的词赋颜色.
       var color = d3.scale.linear()
                .domain([0,1,2,3,4,5,6,10,15,20,100])
                .range(["#ddd", "#ccc", "#bbb", "#aaa", "#999", "#888", "#777", "#666", "#555", "#444", "#333", "#222"]); 
        
        d3.layout.cloud().size([800, 300])
                .words(frequency_list)
                .rotate(0)
                .fontSize(function(d) { return d.size; })
                .on("end", draw)
                .start();
    
        /**
         * Renders the laid-out words as SVG text inside the element with id "word_cloud".
         *
         * @param {Array} words - Words passed by the layout's "end" event; each one
         *   carries text, size, font, x, y and rotate.
         */
        function draw(words) {
            d3.select("#word_cloud").append("svg") // insert the svg into the parent selected by id
                    .attr("width", "100%")
                    .attr("height", "100%")
                    .attr("viewBox","0 0 900 400")
                    .attr("style", "border: 1px solid black")
                    .attr("preserveAspectRatio","xMaxYMax meet")
                    .attr("class", "wordcloud")
                    .append("g")
                    .attr("transform", "translate(400,200)")
                    .selectAll("text")
                    .data(words)
                    .enter().append("text")
                    .style("font-size", function(d) { return d.size + "px"; })
                    // Render with the same font family the layout measured the word with.
                    .style("font-family", function(d) { return d.font; })
                    .style("fill", function(d, i) { return color(i); })
                    // The layout draws its sprites with textAlign "center", so (x, y) is the
                    // horizontal centre of the word; anchor the SVG text the same way.
                    .attr("text-anchor", "middle")
                    .attr("transform", function(d) {
                        return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
                    })
                    .text(function(d) { return d.text; });
        }
    

    到此,首图上的词云就应该可以实现啦~
    :)


    d3.layout.cloud.js

    // Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
    // Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
    (function(exports) {
        function cloud() {
            // Layout state. Every setting below can be read or replaced through
            // the accessor of the same name defined further down.
            var size = [256, 256];
            var text = cloudText;
            var font = cloudFont;
            var fontSize = cloudFontSize;
            var fontStyle = cloudFontNormal;
            var fontWeight = cloudFontNormal;
            var rotate = cloudRotate;
            var padding = cloudPadding;
            var spiral = archimedeanSpiral;
            var words = [];
            var timeInterval = Infinity;
            // Dispatcher for the "word" and "end" events.
            var event = d3.dispatch("word", "end");
            // Handle of the running setInterval, or null when idle.
            var timer = null;
            // The layout object returned to the caller.
            var cloud = {};
    
            // Runs the layout: annotates every word with its text, font, style,
            // weight, rotation, size and padding, sorts the words by size
            // (largest first), then tries to place them one by one. Fires "word"
            // for each placed word and "end" once every word has been tried.
            // Returns the layout object.
            cloud.start = function() {
                // board: zero-filled occupancy bitmask with (size[0] >> 5) entries per pixel row.
                var board = zeroArray((size[0] >> 5) * size[1]),
                    bounds = null,
                    n = words.length,
                    i = -1,
                    tags = [],
                    // Note: the accessor results are written onto the caller's word objects.
                    data = words.map(function(d, i) {
                        d.text = text.call(this, d, i);
                        d.font = font.call(this, d, i);
                        d.style = fontStyle.call(this, d, i);
                        d.weight = fontWeight.call(this, d, i);
                        d.rotate = rotate.call(this, d, i);
                        // ~~ truncates the font size to an integer.
                        d.size = ~~fontSize.call(this, d, i);
                        d.padding = padding.call(this, d, i);
                        return d;
                    }).sort(function(a, b) { return b.size - a.size; });

                // Restart cleanly if a previous run is still scheduled, then run
                // the first step immediately instead of waiting for the timer.
                if (timer) clearInterval(timer);
                timer = setInterval(step, 0);
                step();

                return cloud;

                // Places words until the timeInterval budget is used up, all words
                // have been tried, or the layout was stopped.
                function step() {
                    var start = +new Date,
                        d;
                    while (+new Date - start < timeInterval && ++i < n && timer) {
                        d = data[i];
                        // Random start position between 1/4 and 3/4 of each dimension.
                        d.x = (size[0] * (Math.random() + .5)) >> 1;
                        d.y = (size[1] * (Math.random() + .5)) >> 1;
                        cloudSprite(d, data, i);
                        if (d.hasText && place(board, d, bounds)) {
                            tags.push(d);
                            event.word(d);
                            // Grow the bounding box of everything placed so far.
                            if (bounds) cloudBounds(bounds, d);
                            else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}];
                            // Temporary hack
                            // (re-expresses the position relative to the centre of the layout area)
                            d.x -= size[0] >> 1;
                            d.y -= size[1] >> 1;
                        }
                    }
                    if (i >= n) {
                        cloud.stop();
                        event.end(tags, bounds);
                    }
                }
            }
    
            // Cancels the pending layout timer, if any, and returns the layout.
            cloud.stop = function() {
                if (!timer) {
                    return cloud;
                }
                clearInterval(timer);
                timer = null;
                return cloud;
            };
    
            // Getter/setter for the per-step time budget; null or undefined means no limit.
            cloud.timeInterval = function(x) {
                if (arguments.length === 0) {
                    return timeInterval;
                }
                if (x == null) {
                    timeInterval = Infinity;
                } else {
                    timeInterval = x;
                }
                return cloud;
            };
    
            // Searches along the configured spiral, starting at the tag's current
            // position, for a spot where the tag fits. On success the tag's sprite
            // is OR-ed into the board, tag.sprite is deleted and true is returned;
            // otherwise false is returned.
            function place(board, tag, bounds) {
                // NOTE(review): `perimeter` is not referenced again within this function.
                var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}],
                    startX = tag.x,
                    startY = tag.y,
                    // Length of the layout area's diagonal; used to end the search.
                    maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]),
                    s = spiral(size),
                    // Walk the spiral in a randomly chosen direction.
                    dt = Math.random() < .5 ? 1 : -1,
                    t = -dt,
                    dxdy,
                    dx,
                    dy;

                while (dxdy = s(t += dt)) {
                    dx = ~~dxdy[0];
                    dy = ~~dxdy[1];

                    if (Math.min(dx, dy) > maxDelta) break;

                    tag.x = startX + dx;
                    tag.y = startY + dy;

                    // Skip candidates whose box sticks out of the layout area.
                    if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 ||
                        tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue;
                    // TODO only check for collisions within current bounds.
                    // With no bounds yet (first word) the candidate is accepted as is.
                    if (!bounds || !cloudCollide(tag, board, size[0])) {
                        if (!bounds || collideRects(tag, bounds)) {
                            // Stamp the sprite into the board using the same word/bit
                            // offsets that cloudCollide reads with.
                            var sprite = tag.sprite,
                                w = tag.width >> 5,
                                sw = size[0] >> 5,
                                lx = tag.x - (w << 4),
                                sx = lx & 0x7f,
                                msx = 32 - sx,
                                h = tag.y1 - tag.y0,
                                x = (tag.y + tag.y0) * sw + (lx >> 5),
                                last;
                            for (var j = 0; j < h; j++) {
                                last = 0;
                                for (var i = 0; i <= w; i++) {
                                    board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0);
                                }
                                x += sw;
                            }
                            delete tag.sprite;
                            return true;
                        }
                    }
                }
                return false;
            }
    
            // Getter/setter for the array of words to lay out.
            cloud.words = function(x) {
                if (arguments.length === 0) {
                    return words;
                }
                words = x;
                return cloud;
            };
    
            // Getter/setter for the layout area [width, height]; both values are coerced to numbers.
            cloud.size = function(x) {
                if (arguments.length === 0) {
                    return size;
                }
                var width = +x[0];
                var height = +x[1];
                size = [width, height];
                return cloud;
            };
    
            // Getter/setter for the font accessor; the argument is passed through d3.functor.
            cloud.font = function(x) {
                if (arguments.length === 0) {
                    return font;
                }
                font = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the font-style accessor; the argument is passed through d3.functor.
            cloud.fontStyle = function(x) {
                if (arguments.length === 0) {
                    return fontStyle;
                }
                fontStyle = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the font-weight accessor; the argument is passed through d3.functor.
            cloud.fontWeight = function(x) {
                if (arguments.length === 0) {
                    return fontWeight;
                }
                fontWeight = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the rotation accessor; the argument is passed through d3.functor.
            cloud.rotate = function(x) {
                if (arguments.length === 0) {
                    return rotate;
                }
                rotate = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the text accessor; the argument is passed through d3.functor.
            cloud.text = function(x) {
                if (arguments.length === 0) {
                    return text;
                }
                text = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the spiral generator. A name registered in `spirals`
            // selects that built-in; any other value is stored as given.
            cloud.spiral = function(x) {
                if (arguments.length === 0) {
                    return spiral;
                }
                var builtIn = spirals[x + ""];
                spiral = builtIn || x;
                return cloud;
            };
    
            // Getter/setter for the font-size accessor; the argument is passed through d3.functor.
            cloud.fontSize = function(x) {
                if (arguments.length === 0) {
                    return fontSize;
                }
                fontSize = d3.functor(x);
                return cloud;
            };
    
            // Getter/setter for the padding accessor; the argument is passed through d3.functor.
            cloud.padding = function(x) {
                if (arguments.length === 0) {
                    return padding;
                }
                padding = d3.functor(x);
                return cloud;
            };
    
            // Copy the dispatcher's "on" method onto the layout via d3.rebind, so
            // listeners can be registered with cloud.on(...), and return the result.
            return d3.rebind(cloud, event, "on");
        }
    
        // Default text accessor: reads the word's `text` property.
        function cloudText(d) {
            var label = d.text;
            return label;
        }
    
        // Default font accessor: every word uses "serif".
        function cloudFont() {
            var family = "serif";
            return family;
        }
    
        // Default accessor shared by font style and font weight: always "normal".
        function cloudFontNormal() {
            var keyword = "normal";
            return keyword;
        }
    
        // Default font-size accessor: square root of the word's `value` property.
        function cloudFontSize(d) {
            var weight = d.value;
            return Math.sqrt(weight);
        }
    
        // Default rotation accessor: a random multiple of 30 degrees from -90 to 60.
        function cloudRotate() {
            var bucket = ~~(Math.random() * 6);
            var steps = bucket - 3;
            return steps * 30;
        }
    
        // Default padding accessor: one unit of padding around every word.
        function cloudPadding() {
            var pad = 1;
            return pad;
        }
    
        // Fetches a monochrome sprite bitmap for the specified text.
        // Load in batches for speed.
        //
        // Phase 1 draws data[di], data[di + 1], ... onto the shared canvas until it
        // is full, recording each word's box (width, height, xoff, yoff, x0..y1).
        // Phase 2 reads the canvas back and packs every drawn word into a bitmask
        // stored as d.sprite (32 horizontal pixels per array entry).
        function cloudSprite(d, data, di) {
            // Already rendered by an earlier batch.
            if (d.sprite) return;
            c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio);
            var x = 0,
                y = 0,
                maxh = 0,
                n = data.length;
            --di;
            while (++di < n) {
                d = data[di];
                c.save();
                c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font;
                // Width is measured with an extra "m" appended; height is twice the font size.
                var w = c.measureText(d.text + "m").width * ratio,
                    h = d.size << 1;
                if (d.rotate) {
                    // Axis-aligned box of the rotated w x h rectangle.
                    var sr = Math.sin(d.rotate * cloudRadians),
                        cr = Math.cos(d.rotate * cloudRadians),
                        wcr = w * cr,
                        wsr = w * sr,
                        hcr = h * cr,
                        hsr = h * sr;
                    // Width is rounded up to a multiple of 32.
                    w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5;
                    h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr));
                } else {
                    // Width is rounded up to a multiple of 32.
                    w = (w + 0x1f) >> 5 << 5;
                }
                if (h > maxh) maxh = h;
                // Current canvas row is full: wrap to the start of the next one.
                if (x + w >= (cw << 5)) {
                    x = 0;
                    y += maxh;
                    maxh = 0;
                }
                // No vertical room left: stop here; this word is not drawn in this batch.
                if (y + h >= ch) break;
                c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio);
                if (d.rotate) c.rotate(d.rotate * cloudRadians);
                c.fillText(d.text, 0, 0);
                // Stroking with twice the padding as line width thickens the glyphs.
                if (d.padding) c.lineWidth = 2 * d.padding, c.strokeText(d.text, 0, 0);
                c.restore();
                d.width = w;
                d.height = h;
                d.xoff = x;
                d.yoff = y;
                // Box extents relative to the word's centre.
                d.x1 = w >> 1;
                d.y1 = h >> 1;
                d.x0 = -d.x1;
                d.y0 = -d.y1;
                d.hasText = true;
                x += w;
            }
            var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data,
                sprite = [];
            // Walk back from the last index reached down to index 0.
            while (--di >= 0) {
                d = data[di];
                if (!d.hasText) continue;
                var w = d.width,
                    w32 = w >> 5,
                    h = d.y1 - d.y0;
                // Zero the buffer
                for (var i = 0; i < h * w32; i++) sprite[i] = 0;
                x = d.xoff;
                if (x == null) return;
                y = d.yoff;
                var seen = 0,
                    seenRow = -1;
                for (var j = 0; j < h; j++) {
                    for (var i = 0; i < w; i++) {
                        // k: bitmask entry for this pixel; m: its bit, set when the
                        // first channel of the pixel is non-zero.
                        var k = w32 * j + (i >> 5),
                            m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0;
                        sprite[k] |= m;
                        seen |= m;
                    }
                    // `seen` is never reset, so only empty rows before the first
                    // inked row are trimmed (by advancing y0).
                    if (seen) seenRow = j;
                    else {
                        d.y0++;
                        h--;
                        j--;
                        y++;
                    }
                }
                d.y1 = d.y0 + seenRow;
                d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32);
            }
        }
    
        // Mask-based collision test: reports whether any bit of the tag's sprite,
        // shifted to the tag's current position, overlaps a bit already set in the
        // board. `sw` is the layout width in pixels.
        function cloudCollide(tag, board, sw) {
            var entriesPerRow = sw >> 5;
            var sprite = tag.sprite;
            var w = tag.width >> 5;
            var lx = tag.x - (w << 4);
            var sx = lx & 0x7f;
            var msx = 32 - sx;
            var rows = tag.y1 - tag.y0;
            var offset = (tag.y + tag.y0) * entriesPerRow + (lx >> 5);
            for (var row = 0; row < rows; row++) {
                var last = 0;
                for (var col = 0; col <= w; col++) {
                    // Bits carried over from the previous sprite entry...
                    var carried = last << msx;
                    // ...combined with the shifted bits of the current one, if any.
                    var current = 0;
                    if (col < w) {
                        last = sprite[row * w + col];
                        current = last >>> sx;
                    }
                    if ((carried | current) & board[offset + col]) {
                        return true;
                    }
                }
                offset += entriesPerRow;
            }
            return false;
        }
    
        // Grows `bounds` ([min corner, max corner]) in place so that it also
        // encloses the box of word `d`.
        function cloudBounds(bounds, d) {
            var lo = bounds[0];
            var hi = bounds[1];
            var left = d.x + d.x0;
            var top = d.y + d.y0;
            var right = d.x + d.x1;
            var bottom = d.y + d.y1;
            if (left < lo.x) { lo.x = left; }
            if (top < lo.y) { lo.y = top; }
            if (right > hi.x) { hi.x = right; }
            if (bottom > hi.y) { hi.y = bottom; }
        }
    
        // True when the box of word `a` strictly overlaps the rectangle
        // b = [min corner, max corner]; boxes that merely touch do not count.
        function collideRects(a, b) {
            if (!(a.x + a.x1 > b[0].x)) return false;
            if (!(a.x + a.x0 < b[1].x)) return false;
            if (!(a.y + a.y1 > b[0].y)) return false;
            return a.y + a.y0 < b[1].y;
        }
    
        // Returns a function mapping step t to an [x, y] offset on an Archimedean
        // spiral, stretched horizontally by the width/height ratio of `size`.
        function archimedeanSpiral(size) {
            var aspect = size[0] / size[1];
            return function(t) {
                var angle = t * .1;
                var offsetX = aspect * angle * Math.cos(angle);
                var offsetY = angle * Math.sin(angle);
                return [offsetX, offsetY];
            };
        }
    
        // Returns a stateful function that walks a rectangular spiral: each call
        // moves the current point one step right, down, left or up (chosen from t)
        // and returns the new [x, y]. Horizontal steps are scaled by the
        // width/height ratio of `size`.
        function rectangularSpiral(size) {
            var stepY = 4;
            var stepX = stepY * size[0] / size[1];
            var posX = 0;
            var posY = 0;
            return function(t) {
                var sign = t < 0 ? -1 : 1;
                // See triangular numbers: T_n = n * (n + 1) / 2.
                var side = (Math.sqrt(1 + 4 * sign * t) - sign) & 3;
                if (side === 0) {
                    posX += stepX;
                } else if (side === 1) {
                    posY += stepY;
                } else if (side === 2) {
                    posX -= stepX;
                } else {
                    posY -= stepY;
                }
                return [posX, posY];
            };
        }
    
        // TODO reuse arrays?
        // Builds a new array holding `n` zeros.
        function zeroArray(n) {
            var zeros = [];
            for (var k = 0; k < n; k++) {
                zeros.push(0);
            }
            return zeros;
        }
    
        // Shared module state used by cloudSprite:
        //   cloudRadians - factor converting degrees to radians
        //   cw           - scratch canvas width in 32-pixel units (1 << 11 >> 5 = 64)
        //   ch           - scratch canvas height in pixels (1 << 11 = 2048)
        //   ratio        - scale between canvas units and getImageData pixels
        var cloudRadians = Math.PI / 180,
            cw = 1 << 11 >> 5,
            ch = 1 << 11,
            canvas,
            ratio = 1;

        if (typeof document !== "undefined") {
            // Browser: probe a 1x1 canvas. getImageData returns 4 bytes per pixel, so
            // data.length >> 2 is the number of pixels returned for one canvas unit
            // (presumably more than one on high-density backing stores -- TODO confirm).
            canvas = document.createElement("canvas");
            canvas.width = 1;
            canvas.height = 1;
            ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2);
            canvas.width = (cw << 5) / ratio;
            canvas.height = ch / ratio;
        } else {
            // node-canvas support
            var Canvas = require("canvas");
            canvas = new Canvas(cw << 5, ch);
        }

        // c: the 2D context every sprite is drawn on; spirals: the generators
        // selectable by name through cloud.spiral().
        var c = canvas.getContext("2d"),
            spirals = {
                archimedean: archimedeanSpiral,
                rectangular: rectangularSpiral
            };
        // Draw in red: cloudSprite tests the first byte of each pixel.
        c.fillStyle = c.strokeStyle = "red";
        // Words are drawn centred on their position.
        c.textAlign = "center";

        // Publish the layout factory on the object passed into this closure.
        exports.cloud = cloud;
    })(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports);
    

    相关文章

      网友评论

      • 493fbd74fdbf:d3js的输入和python的输入是怎么对接的呀?
        Kaidi_G:在JS页写 var frequency_list ={{frequency_list}};
        后面双括号里面就是在Py里面你最后得到的格式为[{"text":"text", "size":120},{"text":"text", "size":100}]的列表。
        可参考 https://my.oschina.net/tinyhare/blog/756485

      本文标题:Python词频统计,D3.js word cloud生成,V3

      本文链接:https://www.haomeiwen.com/subject/aumkkxtx.html