lemon 代码分析之文本解析

作者: help_youself | 来源:发表于2020-02-09 16:30 被阅读0次

lemon 代码分析之文本解析
【柳珊·英语公众号学习笔记2018年6月11日Day2】
用了那么多在线词云，终于发现了超好用的词云工具！
词法分析器
简书文章爬虫解析
v-html中使用v-viewer
源码成神之路：Spring Framework源代码解析之IOC
javascript项目上很难用到的基础知识
模板
macOS SwiftUI 实战项目SRT转文本之01 实现文本

lemon主要用于处理图论相关的算法，最短路径，最大流最小割。lemon代码库中大量使用模板，c++语言的black magic。躲病毒期间，对代码进行了阅读，将自己的理解记录在这里。
相关代码摘取自lemon中的dijkstra_test.cc。
图的表示方式：

char test_lgf[] =
  "@nodes\n"
  "label\n"
  "0\n"
  "1\n"
  "2\n"
  "3\n"
  "4\n"
  "@arcs\n"
  "     label length\n"
  "0 1  0     1\n"
  "1 2  1     1\n"
  "2 3  2     1\n"
  "0 3  4     5\n"
  "0 3  5     10\n"
  "0 3  6     7\n"
  "4 2  7     1\n"
  "@attributes\n"
  "source 0\n"
  "target 3\n";

阅读代码的目标，上述文本中表示edge长度的数据，是怎么解析到相应的数据结构中的？即下述run函数，怎么将数据处理到LengthMap中。

template <class Digraph>
void checkDijkstra() {
  LengthMap length(G);

  std::istringstream input(test_lgf);
  digraphReader(G, input).
    arcMap("length", length).
    node("source", s).
    node("target", t).
    run();
}

文本中的"@arcs\n"一下的内容会在readArcs函数中处理。这里的map_num就是表示的是" label length\n"这句话中token的个数。

void readArcs() {
      while (readLine() && line >> c && c != '@') {
        line.putback(c);

        std::string source_token;
        std::string target_token;

//  the content in line,e,g: "0 1  0     1\n"
        if (!_reader_bits::readToken(line, source_token))
          throw FormatError("Source not found");

        if (!_reader_bits::readToken(line, target_token))
          throw FormatError("Target not found");

        std::vector<std::string> tokens(map_num);
        for (int i = 0; i < map_num; ++i) {
          if (!_reader_bits::readToken(line, tokens[i])) {
            std::ostringstream msg;
            msg << "Column not found (" << i + 1 << ")";
            throw FormatError(msg.str());
          }
        }
        if (line >> std::ws >> c)
          throw FormatError("Extra character at the end of line");

        Arc a;
        if (!_use_arcs) {

          typename NodeIndex::iterator it;

          it = _node_index.find(source_token);
          if (it == _node_index.end()) {
            std::ostringstream msg;
            msg << "Item not found: " << source_token;
            throw FormatError(msg.str());
          }
          Node source = it->second;

          it = _node_index.find(target_token);
          if (it == _node_index.end()) {
            std::ostringstream msg;
            msg << "Item not found: " << target_token;
            throw FormatError(msg.str());
          }
          Node target = it->second;

          a = _digraph.addArc(source, target);
          if (label_index != -1)
            _arc_index.insert(std::make_pair(tokens[label_index], a));
        } else {
          if (label_index == -1)
            throw FormatError("Label map not found");
          typename std::map<std::string, Arc>::iterator it =
            _arc_index.find(tokens[label_index]);
          if (it == _arc_index.end()) {
            std::ostringstream msg;
            msg << "Arc with label not found: " << tokens[label_index];
            throw FormatError(msg.str());
          }
          a = it->second;
        }

        for (int i = 0; i < static_cast<int>(_arc_maps.size()); ++i) {
          _arc_maps[i].second->set(a, tokens[map_index[i]]);
        }

      }
      if (readSuccess()) {
        line.putback(c);
      }
}

_arc_maps[i].second->set(a, tokens[map_index[i]])对arc与长度进行处理。_arc_maps就是在arcMap("length", length)注册的。

    template <typename Map>
    DigraphReader& arcMap(const std::string& caption, Map& map) {
      checkConcept<concepts::WriteMap<Arc, typename Map::Value>, Map>();
      _reader_bits::MapStorageBase<Arc>* storage =
        new _reader_bits::MapStorage<Arc, Map>(map);
      _arc_maps.push_back(std::make_pair(caption, storage));
      return *this;
    }

MapStorage对象中对set函数进行追踪。

    template <typename _Item, typename _Map,
              typename _Converter = DefaultConverter<typename _Map::Value> >
    class MapStorage : public MapStorageBase<_Item> {
    public:
      virtual void set(const Item& item ,const std::string& value) {
        _map.set(_graph.direct(item, dir), _converter(value));
      }
}

追踪LengthMap中的set函数。LengthMap的定义在lemon/bits/graph_extender.h

  template <typename _Value>
  class ArcMap
     : public MapExtender<DefaultMap<Graph, Arc, _Value> > 
  template <typename _Graph, typename _Item, typename _Value>
  class DefaultMap
    : public DefaultMapSelector<_Graph, _Item, _Value>::Map
  template <typename _Graph, typename _Item, typename _Value>
  struct DefaultMapSelector {
    typedef ArrayMap<_Graph, _Item, _Value> Map;
  };

  template <typename _Graph, typename _Item, typename _Value>
  class ArrayMap
    : public ItemSetTraits<_Graph, _Item>::ItemNotifier::ObserverBase {
  public:
    const Value& operator[](const Key& key) const {
      int id = Parent::notifier()->id(key);
      return values[id];
    }
    void set(const Key& key, const Value& val) {
      (*this)[key] = val;
    }
private:
Value* values;// when this is allocated?
};

第二个问题出现，Value* values的内存空间是何时分配的？readArcs中的这句话a = _digraph.addArc(source, target)会通知(notifier)observer进行内存分配。这就是ArrayMap继承ObserverBase的作用。

  class ListDigraph : public ExtendedListDigraphBase {
public:
    Arc addArc(Node s, Node t) {
      return Parent::addArc(s, t);
    }
   };
  template <typename Base>
  class DigraphExtender : public Base {
public:
    Arc addArc(const Node& from, const Node& to) {
      Arc arc = Parent::addArc(from, to);
      notifier(Arc()).add(arc);
      return arc;
    }
typedef AlterationNotifier<DigraphExtender, Arc> ArcNotifier;
    ArcNotifier& notifier(Arc) const {
      return arc_notifier;
    }
};
  template <typename _Container, typename _Item>
  class AlterationNotifier {
  public:
   void add(const Item& item) {
      typename Observers::reverse_iterator it;
      try {
        for (it = _observers.rbegin(); it != _observers.rend(); ++it) {
          (*it)->add(item);
        }
      } catch (...) {
        typename Observers::iterator jt;
        for (jt = it.base(); jt != _observers.end(); ++jt) {
          (*jt)->erase(item);
        }
        throw;
      }
    }
};

ArrayMap中的add函数进行内存分配。

  template <typename _Graph, typename _Item, typename _Value>
  class ArrayMap
    : public ItemSetTraits<_Graph, _Item>::ItemNotifier::ObserverBase {
  protected:

    // \brief Adds a new key to the map.
    //
    // It adds a new key to the map. It is called by the observer notifier
    // and it overrides the add() member function of the observer base.
    virtual void add(const Key& key) {
      Notifier* nf = Parent::notifier();
      int id = nf->id(key);
      if (id >= capacity) {
        int new_capacity = (capacity == 0 ? 1 : capacity);
        while (new_capacity <= id) {
          new_capacity <<= 1;
        }
        Value* new_values = allocator.allocate(new_capacity);
        Item it;
        for (nf->first(it); it != INVALID; nf->next(it)) {
          int jd = nf->id(it);;
          if (id != jd) {
            allocator.construct(&(new_values[jd]), values[jd]);
            allocator.destroy(&(values[jd]));
          }
        }
        if (capacity != 0) allocator.deallocate(values, capacity);
        values = new_values;
        capacity = new_capacity;
      }
      allocator.construct(&(values[id]), Value());
    }
};

这里分析的最短路径算法是dijkstra。这个算法里面的逻辑挺容易理解的。但是另一个最短路径算法floyd算法，代码五行，但是很不容易理解。
floyd算法

for(int k=1;k<=n;m++)
    for(int i=1;i<=n;i++)
        for(int j=1;j<=n;j++)
            d[i][j]=min(d[i][j],d[i][k]+d[k][j]);

找了一些资料[2,3,4]，看了之后，也是迷迷糊糊的。

从任意节点i到任意节点j的最短路径不外乎2种可能，1是直接从i到j，2是从i经过若干个节点k到j。所以，我们假设Dis(i,j)为节点u到节点v的最短路径的距离，对于每一个节点k，我们检查Dis(i,k) + Dis(k,j) < Dis(i,j)是否成立，如果成立，证明从i到k再到j的路径比i直接到j的路径短，我们便设置Dis(i,j) = Dis(i,k) + Dis(k,j)，这样一来，当我们遍历完所有节点k，Dis(i,j)中记录的便是i到j的最短路径的距离。

我觉得需要从这个推理公式理解，当k=1时，若在一次三层循环执行完毕后，Dis(i,1) + Dis(1,j) < Dis(i,j)成立，则i，j之间的最短路径必须进过m。接下来的循环就是遍历其他的k点，是否可以进一步使得i->1之间的距离进一步缩小（1->j同理）。
[1]lemon code
[2]最短路四大算法证明以及分析
[3]Floyd算法的证明
[4]最短路径—Dijkstra算法和Floyd算法

lemon 代码分析之文本解析
lemon主要用于处理图论相关的算法，最短路径，最大流最小割。lemon代码库中大量使用模板，c++语言的bla...
【柳珊·英语公众号学习笔记2018年6月11日Day2】
【今日俚语】 lemon俚语解析：差劲的车，差劲的产品【例句】To avoid buying a lemon, ...
用了那么多在线词云，终于发现了超好用的词云工具！
在python中，wordcloud和jieba等库都可以分析中文文本解析词频，但对于缺乏代码基础的人员来说，在线...
词法分析器
词法分析器词法分析器又称扫描器。词法分析是指将我们编写的文本代码流解析为一个一个的记号，分析得到的记号(Toke...
简书文章爬虫解析
核心解析解析过滤简书自定义标签，文章再用其他富文本逆向解析即可用vue quill editor解析完整代码
v-html中使用v-viewer
v-html解析后端返回的富文本,需求:解析后富文本中图片可点击放大查看代码如下: 注: 这也算是 v-view...
源码成神之路：Spring Framework源代码解析之IOC
在Spring Framework源代码解析之IOC容器（一）和Spring Framework源代码解析之IOC...
javascript项目上很难用到的基础知识
1.预解析解析JavaScript代码的时候，首先是执行语法分析，然后才是预解析。语法分析----------分...
模板
题目要求题目解析思路一：分析 text 代码段思路二：分析 text 代码段测试代码运行结果 tex...
macOS SwiftUI 实战项目SRT转文本之01 实现文本
实战需求 macOS SwiftUI 实战项目SRT转文本之01 实现文本打开和文本解析本文价值与收获看完本文...