rope 详解

konyakest · 2024-12-23 20:46:45 · 科技·工程

rope

rope，全程 __gnu_cxx::rope，是 gnu 提供的用来快速处理字符串（或者线性序列）的数据结构

定义以及接口

定义

rope 是一种高效的字符串数据结构，特别适用于频繁的字符串拼接和分割操作。它将字符串分割成多个小段，并将这些小段存储在树结构中。以下是 rope 的定义及其主要成员函数和数据成员的介绍。

接口

以下是 rope 的外部接口：

namespace __gnu_cxx {

template <class _CharT, class _Alloc = __pool_alloc>
class rope {
public:
    // 类型定义
    typedef _CharT value_type;
    typedef _CharT* pointer;
    typedef const _CharT* const_pointer;
    typedef _CharT& reference;
    typedef const _CharT& const_reference;
    typedef size_t size_type;
    typedef ptrdiff_t difference_type;
    typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
    typedef std::reverse_iterator<iterator> reverse_iterator;

    // 构造函数
    rope();
    rope(const _CharT* __s, size_type __len, const allocator_type& __a = allocator_type());
    rope(const _CharT* __s, const _CharT* __e, const allocator_type& __a = allocator_type());
    rope(const const_iterator& __s, const const_iterator& __e, const allocator_type& __a = allocator_type());
    rope(const iterator& __s, const iterator& __e, const allocator_type& __a = allocator_type());
    rope(size_type __n, _CharT __c, const allocator_type& __a = allocator_type());

    // 成员函数
    bool empty() const;
    size_type size() const;
    void clear();
    void insert(size_type __p, const rope& __r);
    void insert(size_type __p, const _CharT* __s, size_type __n);
    void insert(size_type __p, size_type __n, _CharT __c);
    void replace(size_type __p, size_type __n, const rope& __r);
    void replace(size_type __p, size_type __n, const _CharT* __s, size_type __slen);
    void replace(size_type __p, size_type __n, size_type __slen, _CharT __c);
    void erase(size_type __p, size_type __n);
    rope substr(size_type __p, size_type __n) const;
    size_type find(_CharT __c, size_type __pos = 0) const;
    size_type find(const _CharT* __s, size_type __pos = 0) const;
    size_type find(const rope& __r, size_type __pos = 0) const;

    // 迭代器
    const_iterator begin() const;
    const_iterator end() const;
    iterator begin();
    iterator end();
    const_reverse_iterator rbegin() const;
    const_reverse_iterator rend() const;
    reverse_iterator rbegin();
    reverse_iterator rend();
    iterator mutable_begin();
    iterator mutable_end();
    reverse_iterator mutable_rbegin();
    reverse_iterator mutable_rend();
    reference mutable_reference_at(size_type __pos);

    // 常量
    template <class _CharT, class _Alloc>
    const typename rope<_CharT, _Alloc>::size_type
        rope<_CharT, _Alloc>::npos = (size_type)(-1);

    // 下标访问
    _CharT operator[](size_type __pos) const;
    _CharT at(size_type __pos) const;

    // 友元函数
    friend rope operator+(const rope& __left, const rope& __right);
    friend bool operator==(const rope& __left, const rope& __right);
    friend bool operator!=(const rope& __left, const rope& __right);
    friend bool operator<(const rope& __left, const rope& __right);
    friend bool operator>(const rope& __left, const rope& __right);
    friend bool operator<=(const rope& __left, const rope& __right);
    friend bool operator>=(const rope& __left, const rope& __right);
};

} // namespace __gnu_cxx

需要注意的是，rope 的 operator[] 返回的是值。如果需要实现 r[0]=1 之类的语句，需要使用：

r.mutable_reference_at(0)=1;

或者使用：

r.mutable_begin()[4]=1;

还有一个不建议使用的方法，是在导入头文件前加入两行：

#define __STD_STUFF
#define _char_ref_proxy reference

这样就可以使用 operator[] 更改元素

存储结构

rope 的存储结构是一个平衡树，每个节点可以是叶子节点或内部节点。叶子节点存储实际的字符串数据，而内部节点存储指向子节点的指针

主要组件

_RopeRep：rope 的基本节点类型，所有节点类型都继承自 _RopeRep
_RopeLeaf：叶子节点，存储实际的字符串数据。
_RopeConcatenation：连接节点，表示两个子节点的拼接
_RopeFunction：函数节点，通过函数生成字符串数据
_RopeSubstring：子字符串节点，表示字符串的子串

`_RopeRep` 基类

所有节点类型都继承自 _RopeRep，它包含了节点的基本信息，如大小和引用计数

template <class _CharT, class _Alloc>
struct _RopeRep {
    typedef _RopeRep<_CharT, _Alloc> _Self;
    typedef size_t size_type;

    size_type _M_size; // 节点表示的字符串的大小
    unsigned char _M_depth; // 节点的深度
    mutable _Atomic_word _M_refcount; // 引用计数

    // 析构函数
    virtual ~_RopeRep() {}

    // 其他成员函数和数据成员
};

`_RopeLeaf` 叶子节点

叶子节点存储实际的字符串数据

template <class _CharT, class _Alloc>
struct _RopeLeaf : public _RopeRep<_CharT, _Alloc> {
    const _CharT* _M_data; // 字符串数据

    // 构造函数
    _RopeLeaf(const _CharT* __d, size_t __s, const _Alloc& __a)
        : _RopeRep<_CharT, _Alloc>(__s, 0), _M_data(__d) {}

    // 其他成员函数和数据成员
};

`_RopeConcatenation` 连接节点

连接节点表示两个子节点的拼接

template <class _CharT, class _Alloc>
struct _RopeConcatenation : public _RopeRep<_CharT, _Alloc> {
    _RopeRep<_CharT, _Alloc>* _M_left; // 左子节点
    _RopeRep<_CharT, _Alloc>* _M_right; // 右子节点

    // 构造函数
    _RopeConcatenation(_RopeRep<_CharT, _Alloc>* __l, _RopeRep<_CharT, _Alloc>* __r)
        : _RopeRep<_CharT, _Alloc>(__l->_M_size + __r->_M_size, std::max(__l->_M_depth, __r->_M_depth) + 1),
          _M_left(__l), _M_right(__r) {}

    // 其他成员函数和数据成员
};

`_RopeFunction` 函数节点

函数节点通过一个函数生成字符串数据

template <class _CharT, class _Alloc>
struct _RopeFunction : public _RopeRep<_CharT, _Alloc> {
    char_producer<_CharT>* _M_fn; // 用于生成字符串数据的函数对象
    bool _M_delete_when_done; // 是否在完成后删除函数对象

    // 构造函数
    _RopeFunction(char_producer<_CharT>* __fn, size_t __s, bool __d)
        : _RopeRep<_CharT, _Alloc>(__s, 0), _M_fn(__fn), _M_delete_when_done(__d) {}

    // 其他成员函数和数据成员
};

`_RopeSubstring` 子字符串节点

子字符串节点表示字符串的子串

template <class _CharT, class _Alloc>
struct _RopeSubstring : public _RopeRep<_CharT, _Alloc> {
    _RopeRep<_CharT, _Alloc>* _M_base; // 基础字符串
    size_t _M_start; // 子串的起始位置

    // 构造函数
    _RopeSubstring(_RopeRep<_CharT, _Alloc>* __b, size_t __s, size_t __l)
        : _RopeRep<_CharT, _Alloc>(__l, __b->_M_depth), _M_base(__b), _M_start(__s) {}

    // 其他成员函数和数据成员
};

说人话

简单总结，rope 存储结构是一颗 leafy 的平衡树（连接节点和叶子节点），而当你使用 substr 创建一个 slice 的时候会创建子字符串节点，这个节点只存储了一个其他节点的引用。当操作这个子字符串的时候才会进行修改

引用计数

_RopeRep 基类中存储了引用计数，这意味着 rope 的所有节点都有引用计数

以下是引用的增加与减少：

template <class _CharT, class _Alloc>
void rope<_CharT, _Alloc>::_S_ref(_RopeRep* __t) {
    if (__t != nullptr) {
        __atomic_add_fetch(&__t->_M_refcount, 1, __ATOMIC_ACQ_REL);
    }
}

template <class _CharT, class _Alloc>
void rope<_CharT, _Alloc>::_S_unref(_RopeRep* __t) {
    if (__t != nullptr && __atomic_sub_fetch(&__t->_M_refcount, 1, __ATOMIC_ACQ_REL) == 0) {
        // 引用计数变为零，释放节点的内存
        __t->_M_destroy();
    }
}

这个机制使得 rope 真正成为了持久化的数据结构。例如，rope 的 operator= 只是改变引用计数，而没有任何内存的操作，因此是 O(1) 的：

rope &
operator=(const rope &__x)
{
    _RopeRep *__old = this->_M_tree_ptr;
    this->_M_tree_ptr = __x._M_tree_ptr;
    _S_ref(this->_M_tree_ptr);
    _S_unref(__old);
    return *this;
}

平衡的维护

既然是平衡树，那么我们最在意的就是：它是如何维持平衡的？

我们发现，rope 节点只记录了深度和大小这两个信息，我们可以猜测：它根据深度判断大小是否合法，如果太小则重构

我们在连接树（即使用 substr 拼接）的时候，会调用 _S_concat 函数：

template <class _CharT, class _Alloc>
typename rope<_CharT, _Alloc>::_RopeRep *
rope<_CharT, _Alloc>::
    _S_concat(_RopeRep *__left, _RopeRep *__right)
{
    using std::size_t;
    if (0 == __left)
    {
        _S_ref(__right);
        return __right;
    }
    if (0 == __right)
    {
        __left->_M_ref_nonnil();
        return __left;
    }
    if (__detail::_S_leaf == __right->_M_tag)
    {
        if (__detail::_S_leaf == __left->_M_tag)
        {
            if (__right->_M_size + __left->_M_size <= size_t(_S_copy_max))
                return _S_leaf_concat_char_iter((_RopeLeaf *)__left,
                                                ((_RopeLeaf *)__right)->_M_data,
                                                __right->_M_size);
        }
        else if (__detail::_S_concat == __left->_M_tag && __detail::_S_leaf == ((_RopeConcatenation *)
                                                                                    __left)
                                                                                   ->_M_right->_M_tag)
        {
            _RopeLeaf *__leftright =
                (_RopeLeaf *)(((_RopeConcatenation *)__left)->_M_right);
            if (__leftright->_M_size + __right->_M_size <= size_t(_S_copy_max))
            {
                _RopeRep *__leftleft = ((_RopeConcatenation *)__left)->_M_left;
                _RopeRep *__rest = _S_leaf_concat_char_iter(__leftright,
                                                            ((_RopeLeaf *)
                                                                 __right)
                                                                ->_M_data,
                                                            __right->_M_size);
                __leftleft->_M_ref_nonnil();
                __try
                {
                    return (_S_tree_concat(__leftleft, __rest));
                }
                __catch(...)
                {
                    _S_unref(__leftleft);
                    _S_unref(__rest);
                    __throw_exception_again;
                }
            }
        }
    }
    __left->_M_ref_nonnil();
    __right->_M_ref_nonnil();
    __try
    {
        return (_S_tree_concat(__left, __right));
    }
    __catch(...)
    {
        _S_unref(__left);
        _S_unref(__right);
        __throw_exception_again;
    }
}

我们简化一下，是这样的：

当叶子节点合并，且 size 之和较小的时候，调用 _S_leaf_concat_char_iter 函数。这个函数就是直接将叶子节点 copy 成一个
否则，最终会调用 _S_tree_concat 函数

_S_tree_concat 函数：

template <class _CharT, class _Alloc>
typename rope<_CharT, _Alloc>::_RopeRep *
rope<_CharT, _Alloc>::
    _S_tree_concat(_RopeRep *__left, _RopeRep *__right)
{
    using std::size_t;
    _RopeConcatenation *__result = _S_new_RopeConcatenation(__left, __right,
                                                            __left->_M_get_allocator());
    size_t __depth = __result->_M_depth;

    if (__depth > 20 && (__result->_M_size < 1000 || __depth > size_t(__detail::_S_max_rope_depth)))
    {
        _RopeRep *__balanced;

        __try
        {
            __balanced = _S_balance(__result);
            __result->_M_unref_nonnil();
        }
        __catch(...)
        {
            rope::_C_deallocate(__result, 1);
            __throw_exception_again;
        }
        // In case of exception, we need to deallocate
        // otherwise dangling result node.  But caller
        // still owns its children.  Thus unref is
        // inappropriate.
        return __balanced;
    }
    else
        return __result;
}

其中，当

if (__depth > 20 && (__result->_M_size < 1000 || __depth > size_t(__detail::_S_max_rope_depth)))

（深度较大，且大小较小）的时候，会进行平衡维护

它是这样重构的：

template <class _CharT, class _Alloc>
typename rope<_CharT, _Alloc>::_RopeRep *
rope<_CharT, _Alloc>::
    _S_balance(_RopeRep *__r)
{
    __try
    {
        // 分解树并存储在森林中
        _RopeRep *__forest[int(__detail::_S_max_rope_depth) + 1] = {0};
        _S_add_to_forest(__r, __forest);

        // 重新组合子树
        _RopeRep *__result = 0;
        for (int __i = 0; __i <= int(__detail::_S_max_rope_depth); ++__i)
        {
            if (__forest[__i] != 0)
            {
                if (__result == 0)
                    __result = __forest[__i];
                else
                    __result = _S_tree_concat(__result, __forest[__i]);
            }
        }

        // 返回平衡后的树
        return __result;
    }
    __catch(...)
    {
        // 异常处理
        // ...
    }

    // 处理深度超过最大深度的情况
    if (__result->_M_depth > int(__detail::_S_max_rope_depth))
    {
        // 进一步平衡
        // ...
    }
}

分解树：
- _S_add_to_forest 函数用于将树分解成多个子树，并将这些子树添加到森林中。
- 森林是一个数组，每个元素存储一个子树。
重新组合子树：
- 遍历森林中的子树，将它们按照一定的规则重新组合成一个新的平衡树。
- _S_tree_concat 函数用于组合两个子树。
处理深度超过阈值的情况：
- 如果重新组合后的树的深度仍然超过阈值，则递归调用 _S_balance 函数，继续分解和组合，直到树的深度在允许范围内。

其中，_S_add_to_forest 是这样的：

template <class _CharT, class _Alloc>
void
rope<_CharT, _Alloc>::
    _S_add_to_forest(_RopeRep *__r, _RopeRep **__forest)
{
    if (__r->_M_is_balanced)
    {
        // 处理平衡的子树
        int __depth = __r->_M_depth;
        while (__forest[__depth] != 0)
        {
            __r = _S_tree_concat(__forest[__depth], __r);
            __forest[__depth] = 0;
            ++__depth;
        }
        __forest[__depth] = __r;
    }
    else
    {
        // 处理不平衡的子树
        _S_add_leaf_to_forest(__r, __forest);
    }
}

_S_tree_concat 是这样的：

template <class _CharT, class _Alloc>
typename rope<_CharT, _Alloc>::_RopeRep *
rope<_CharT, _Alloc>::
    _S_tree_concat(_RopeRep *__left, _RopeRep *__right)
{
    if (__left == 0)
        return __right;
    if (__right == 0)
        return __left;

    _RopeRep *__result = new _RopeRep;
    __result->_M_left = __left;
    __result->_M_right = __right;
    __result->_M_size = __left->_M_size + __right->_M_size;
    __result->_M_depth = std::max(__left->_M_depth, __right->_M_depth) + 1;
    __result->_M_is_balanced = true;

    return __result;
}

总之，rope 的平衡维护是很少的，只有两点：

__depth > 20 && (__result->_M_size < 1000 || __depth > size_t(__detail::_S_max_rope_depth))

的时候，进行重构

当叶子节点

__orig_size + __slen <= size_t(_S_copy_max)

的时候，将叶子节点合并，否则重构

可持久化平衡树？

很可惜，由于 rope 的平衡操作极其有限，且其在叶子节点存储了约 23 个数据，这使得在随机情况下常数优秀的同时，在持久化场景下空间常数巨大

所以，永远不要将 rope 当做持久化数据结构

灵活的绳子

正如它的名字，你可以将它当做一个这样的数据结构：

空间常数巨大、以至于基本不能持久化

由于这样的性质，它可以维护一些特殊的问题，比如序列翻转

我们可以维护一正一反两个 rope，每次取 substr 后重新拼接，即可实现