Files
allhaileris afb81b8278
Some checks failed
Docker. / Ubuntu (push) Has been cancelled
User-agent updater. / User-agent (push) Failing after 15s
Lock Threads / lock (push) Failing after 10s
Waiting for answer. / waiting-for-answer (push) Failing after 22s
Needs user action. / needs-user-action (push) Failing after 8s
Can't reproduce. / cant-reproduce (push) Failing after 8s
Close stale issues and PRs / stale (push) Has been cancelled
init
2026-02-16 15:50:16 +03:00

121 KiB
Raw Permalink Blame History

references:

Appendix 1: Sentinels and Code Generation

In this appendix we explore the effect of sentinels on code generation. I'll show that allowing the type of the end iterator to differ from the begin can have a positive effect on the performance of algorithms. First, I'll note that nothing that can be done with sentinels cannot also be done with appropriately designed end iterators. Here, for instance, is the code for an iterator that can be used to adapt a null-terminated string to the STL. It is implemented with the help of the Boost.Iterators library:

#include <cassert>
#include <iostream>
#include <boost/iterator/iterator_facade.hpp>

struct c_string_range
{
private:
    char const *str_;
public:
    using const_iterator = struct iterator
      : boost::iterator_facade<
            iterator
          , char const
          , std::forward_iterator_tag
        >
    {
    private:
        friend class boost::iterator_core_access;
        friend struct c_string_range;
        char const * str_;
        iterator(char const * str)
          : str_(str)
        {}
        bool equal(iterator that) const
        {
            return str_
                ? (that.str_ == str_ ||
                     (!that.str_ && !*str_))
                : (!that.str_ || !*that.str_);
        }
        void increment()
        {
            assert(str_ && *str_);
            ++str_;
        }
        char const& dereference() const
        {
            assert(str_ && *str_);
            return *str_;
        }
    public:
        iterator()
          : str_(nullptr)
        {}
    };
    c_string_range(char const * str)
      : str_(str)
    {
        assert(str_);
    }
    iterator begin() const
    {
        return iterator{str_};
    }
    iterator end() const
    {
        return iterator{};
    }
    explicit operator bool() const
    {
        return !!*str_;
    }
};

int c_strlen(char const *sz)
{
    int i = 0;
    for(; *sz; ++sz)
        ++i;
    return i;
}

int range_strlen(
    c_string_range::iterator begin,
    c_string_range::iterator end)
{
    int i = 0;
    for(; begin != end; ++begin)
        ++i;
    return i;
}

The code traverses the sequence of characters without first computing its end. It does it by creating a dummy end iterator such that any time a real iterator is compared to it, it checks to see if the real iterator points to the null terminator. All the comparison logic is in the c_string_range::iterator::equal member function.

The functions c_strlen and range_strlen implement equivalent procedures for computing the length of a string, the first using raw pointers and a check for the null terminator, the second using c_string_range's STL iterators. The resulting optimized assembly code (clang 3.4 -O3 -DNDEBUG) generated for the two functions highlights the lost optimization opportunities.

c_strlen
range_strlen
    pushl   %ebp
    movl    %esp, %ebp
    movl    8(%ebp), %ecx
    xorl    %eax, %eax
    cmpb    $0, (%ecx)
    je      LBB1_3
    xorl    %eax, %eax
    .align  16, 0x90
LBB1_2:
    cmpb    $0, 1(%ecx,%eax)
    leal    1(%eax), %eax
    jne     LBB1_2
LBB1_3:
    popl    %ebp
    ret
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %esi
    leal    8(%ebp), %ecx
    movl    12(%ebp), %esi
    xorl    %eax, %eax
    testl   %esi, %esi
    movl    8(%ebp), %edx
    jne     LBB2_4
    jmp     LBB2_1
    .align  16, 0x90
LBB2_8:
    incl    %eax
    incl    %edx
    movl    %edx, (%ecx)
LBB2_4:
    testl   %edx, %edx
    jne     LBB2_5
    cmpb    $0, (%esi)
    jne     LBB2_8
    jmp     LBB2_6
    .align  16, 0x90
LBB2_5:
    cmpl    %edx, %esi
    jne     LBB2_8
    jmp     LBB2_6
    .align  16, 0x90
LBB2_3:
    leal    1(%edx,%eax), %esi
    incl    %eax
    movl    %esi, (%ecx)
LBB2_1:
    movl    %edx, %esi
    addl    %eax, %esi
    je      LBB2_6
    cmpb    $0, (%esi)
    jne     LBB2_3
LBB2_6:
    popl    %esi
    popl    %ebp
    ret

Code like c_string_range exists in the wild; for instance, see Beman Dawes' ntcts_iterator[@ntcts-iterator]. But more typically, when users want to use an STL algorithm on a C-style string, they call strlen to find the end first (this is what the standard regex algorithms do when passed C-style strings). That traverses the string an extra time needlessly. Also, such a trick is not possible for input sequences like those traversed by std::istream_iterator that consume their input.

Rather than mocking up a dummy end iterator with a computationally expensive equality comparison operation, we can use a sentinel type that encodes end-ness in its type. Below is an example from the Range-v3 library[@range-v3], which uses a range_facade class template to generate iterators and sentinels from a simple range-like interface:

using namespace ranges;
struct c_string_iterable
  : range_facade<c_string_iterable>
{
private:
    friend range_access;
    char const *sz_;
    char const & current() const { return *sz_; }
    void next() { ++sz_; }
    bool done() const { return *sz_ == 0; }
    bool equal(c_string_iterable const &that) const
    { return sz_ == that.sz_; }
public:
    c_string_iterable() = default;
    c_string_iterable(char const *sz)
        : sz_(sz) {}
};

// Iterable-based
int iterable_strlen(
    range_iterator_t<c_string_iterable> begin,
    range_sentinel_t<c_string_iterable> end)
{
    int i = 0;
    for(; begin != end; ++begin)
        ++i;
    return i;
}

The assembly generated for iterable_strlen is nearly identical to that for the hand-coded c_strlen:

c_strlen
iterable_strlen
    pushl   %ebp
    movl    %esp, %ebp
    movl    8(%ebp), %ecx
    xorl    %eax, %eax
    cmpb    $0, (%ecx)
    je      LBB1_3
    xorl    %eax, %eax
    .align  16, 0x90
LBB1_2:
    cmpb    $0, 1(%ecx,%eax)
    leal    1(%eax), %eax
    jne     LBB1_2

LBB1_3: popl %ebp ret

    pushl   %ebp
movl    %esp, %ebp
movl    8(%ebp), %ecx
xorl    %eax, %eax
cmpb    $0, (%ecx)
je      LBB1_4
leal    8(%ebp), %edx
.align  16, 0x90
LBB1_2:
cmpb    $0, 1(%ecx,%eax)
leal    1(%eax), %eax
jne     LBB1_2
addl    %eax, %ecx
movl    %ecx, (%edx)
LBB1_4:
popl    %ebp
ret

The generated code for iterable_strlen is better than for range_strlen because the sentinel has a different type than the iterator, so that the expression begin != end can be optimized into *begin == 0. In range_strlen, begin == end is comparing two objects of the same type, and since either begin or end could be a sentinel -- or both, or neither -- the compiler can't elide the extra checks without extra information from the surrounding calling context, which isn't always available; hence, the worse code gen.

In addition to the performance impact, the complexity of implementing a correct operator== for an iterator with a dummy sentinel can present problems. Chandler Carruth reports that such comparison operators have been a rich source of bugs for Google.

Appendix 2: Sentinels, Iterators, and the Cross-Type EqualityComparable Concept

This appendix describes the theoretical justification for sentinels from the perspective of the STL concepts as set out in N3351[@n3351]. In that paper, the foundational concept EqualityComparable is described in depth, not only its syntactic constraints but also its semantic axioms. It is not enough that the syntax a == b compiles. It has to be a meaningful comparison. Here I explain why I believe it is meaningful to compare an iterator with a sentinel of a different type for equality.

In the expression x == y, where x and y have different types, the EqualityComparable concept requires that the types of both x and y must themselves be EqualityComparable, and there must be a common type to which they can both be converted, and that type must also be EqualityComparable. Think of comparing a char with a short. It works because both char and short are EqualityComparable, and because they can both be converted to an int which is also EqualityComparable.

Iterators are comparable, and sentinels are trivially comparable (they always compare equal). The tricky part is the common type requirement. Logically, every iterator/sentinel pair has a common type that can be constructed as follows: assume the existence of a new iterator type I that is a tagged union that contains either an iterator or a sentinel. When an iterator is compared to a sentinel, it behaves semantically as if both the iterator and the sentinel were first converted to two objects of type I — call them lhs and rhs — and then compared according to the following truth table:

LHS IS SENTINEL ? RHS IS SENTINEL ? LHS == RHS ?
true true true[*]
true false done(rhs.iter)
false true done(lhs.iter)
false false lhs.iter == rhs.iter

In Appendix 1, there is an implementation of c_string_range whose iterator's operator== is a procedure for evaluating this truth table. Thats no coincidence; that was a special case of this more general construction.

In summary, for every iterator/sentinel pair, we can construct a common iterator type that implements an equivalent procedure for computing equality. The existence of this common type is what allows iterator/sentinel pairs to satisfy the EqualityComparable requirement.

As a final note, the Range v3 library has a general implementation of this common iterator type as a parametrized type, and appropriate specializations of std::common_type that allow the constrained algorithms to type-check correctly. It works well in practice, both for the purpose of type-checking the algorithms and for adapting ranges with iterator/sentinel pairs to old code that expects the begin and end of a range to have the same type.

Sentinel Equality

The first line of table above shows that sentinels always compare equal -- regardless of their state. This can seem unintuitive in some situations. For instance:

auto s = "Hello World!?";
// Hypothetical range-like facilities for exposition only:
auto r1 = make_range( &s[0], equal_to('!') );
auto r2 = make_range( &s[0], equal_to('?') );
// 'end()' returns each 'equal_to' sentinel object:
assert(r1.end() == r2.end() && "really?");

In the above example, it's clear that r1.end() and r2.end() are referring to different elements, and that's reflected in their having different state. So shouldn't sentinel equality be a stateful comparison?

It's incorrect to think of sentinels as objects whose position is determined by their value. Thinking that way leads to logical inconsistencies. It's not hard to define sentinels that -- if you consider their state during comparison -- compare equal when they represent a different position, and not equal when they represent the same position. Consider:

auto s = "hello! world!";
auto r1 = make_range( &s[0], equal_to('!') );
auto r2 = make_range( &s[7], equal_to('!') );

assert(r1.end() == r2.end() && "really?");

In the above code, although the sentinels have the same type and the same state, they refer to different elements.

Also imagine a counted iterator that stores an underlying iterator and a count that starts at zero. It's paired with a sentinel that contains an ending count:

auto s = "hello";
// Hypothetical range-like facilities for exposition only:
auto r1 = make_range( counting(&s[0]), end_count(5) );
auto r2 = make_range( counting(&s[1]), end_count(4) );

assert(r1.end() != r2.end() && "really?");

The end sentinels have the same type and different state, but they refer to the same element.

The question then is: What does it mean to compare sentinels for equality? The meaning has to be taken from the context in which the operation appears. In that context -- the algorithms -- the sentinel is one of two positions denoting a range.

Iterator comparison is a position comparison. A sentinel, when it's paired with an iterator as in the algorithms, denotes a unique position. Since iterators must be valid sentinels, comparing sentinels should also be a position comparison. In this context, sentinels always represent the same position -- the end -- even though that position is not yet know. Hence, sentinels compare equal always, without any need to consider state. Anything else yields results that are inconsistent with the use of iterators to denote the end of a range.

What about the examples above where sentinel comparison seems to give nonsensical results? In those examples, we are expecting sentinels to have meaning outside of their valid domain -- when they are considered in isolation of their paired iterator. It is the paired iterator that give the sentinel its "unique-position-ness", so we shouldn't expect an operation that compares position to make sense on an object that is not capable of representing a position by itself.

Appendix 3: D Ranges and Algorithmic Complexity

As currently defined, D's ranges make it difficult to implement algorithms over bidirectional sequences that take a position in the middle as an argument. A good example is a hypothetical is_word_boundary algorithm. In C++ with iterators, it might look like this:

template< BidirectionalIterator I >
bool is_word_boundary( I begin, I middle, I end )
{
    bool is_word_prev = middle == begin ? false : isword(*prev(middle));
    bool is_word_this = middle == end ? false : isword(*middle);
    return is_word_prev != is_word_this;
}

Users might call this in a loop to find the first word boundary in a range of characters as follows:

auto i = myrange.begin();
for( ; i != myrange.end(); ++i )
    if( is_word_boundary( myrange.begin(), i, myrange.end() ) )
        break;

The question is how such an API should be designed in D, since D doesn't have iterators. In a private email exchange, Andrei Alexandrescu, the designer of D's range library, described this potential implementation (the detailed implementation is ours):

bool is_word_boundary(Range1, Range2)( Range1 front, Range2 back )
    if (isBidirectionalRange!Range1 && isInputRange!Range2 )
{
    bool is_word_prev = front.empty ? false : isword(front.back);
    bool is_word_this = back.empty ? false : isword(back.front);
    return is_word_prev != is_word_this;
}

range r = myrange;
size_t n = 0;
for(range r = myrange; !r.empty; r.popFront(), ++n)
     if( is_word_boundary( takeExactly(myrange, n), r) )
        break;

This example uses D's takeExactly range adaptor. takeExactly is like Haskell's take function which creates a list from another list by taking the first n elements, but takeExactly requires that the input range has at least n elements begin with.

The trouble with the above implementation is that takeExactly demotes Bidirectional ranges to Forward ranges. For example, taking the first N elements of a linked list cannot give access to the final element in O(1). So the for loop is trying to pass a Forward range to an algorithm that clearly requires a Bidirectional range. The only fix is to loosen the requirements of Range1 to be Forward. To do that, the implementation of is_word_boundary needs to change so that, when it is passed a Forward range, it walks to the end of the front range and tests the last character. Obviously, that's an O(N) operation.

In other words, by converting the is_word_boundary from iterators to D-style ranges, the algorithm goes from O(1) to O(N). From this we draw the following conclusion: D's choice of algorithmic basis operations is inherently less efficient than C++'s.

Appendix 4: On Counted Ranges and Efficiency

The three types of ranges that we would like the Iterable concept to be able to efficiently model are:

  1. Two iterators
  2. An iterator and a predicate
  3. An iterator and a count

The Iterator/Sentinel abstraction is what makes it possible for the algorithms to handle these three cases with uniform syntax. However, the third option presents challenges when trying to make some algorithms optimally efficient.

Counted ranges, formed by specifying a position and a count of elements, have iterators -- as all Iterables do. The iterators of a counted range must know the range's extent and how close they are to reaching it. Therefore, the counted range's iterators must store both an iterator into the underlying sequence and a count -- either a count to the end or a count from the front. Here is one potential design:

class counted_sentinel
{};

template<WeakIterator I>
class counted_iterator
{
    I it_;
    DistanceType<I> n_; // distance to end
public:
    // ... constructors...
    using iterator_category =
        typename iterator_traits<I>::iterator_category;
    decltype(auto) operator*() const
    {
        return *it_;
    }
    counted_iterator & operator++()
    {
        ++it_;
        --n_;
        return *this;
    }
    friend bool operator==(counted_iterator const & it,
                           counted_sentinel)
    {
        return it.n_ == 0;
    }
    // ... other operators...
};

template<WeakIterator I>
class counted_range
{
    I begin_;
    DistanceType<I> count_;
public:
    // ... constructors ...
    counted_iterator<I> begin() const
    {
        return {begin_, count_};
    }
    counted_sentinel end() const
    {
        return {};
    }
};

template<WeakIterator I>
struct common_type<counted_iterator<I>, counted_sentinel>
    // ... see Appendix 2 ...

There are some noteworthy things about the code above. First, counted_iterator bundles an iterator and a count. Right off, we see that copying counted iterators is going to be more expensive, and iterators are copied frequently. A mitigating factor is that the sentinel is empty. Passing a counted_iterator and a counted_sentinel to an algorithm copies as much data as passing an iterator and a count. When passed separately, the compiler probably has an easier time fitting them in registers, but some modern compilers are capable passing the members of a struct in registers. This compiler optimization is sometimes called Scalar Replacement of Aggregates (see Muchnick 1997, ch. 12.2 [@muchnick97]) and is known to be implemented in gcc and LLVM (see this recent LLVM commit[@llvm-sroa] for example).

Also, incrementing a counted iterator is expensive: it involves incrementing the underlying iterator and decrementing the internal count. To see why this is potentially expensive, consider the trivial case of passing a counted_iterator<list<int>::iterator> to advance. That counted iterator type is bidirectional, and advance must increment it n times:

template<BidirectionalIterator I>
void advance(I & i, DistanceType<I> n)
{
    if(n >= 0)
        for(; n != 0; --n)
            ++i;
    else
        for(; n != 0; ++n)
            --i;
}

Notice that for each ++i or --i here, two increments or decrements are happening when I is a counted_iterator. This is sub-optimal. A better implementation for counted_iterator is:

template<BidirectionalIterator I>
void advance(counted_iterator<I> & i, DistanceType<I> n)
{
    i.n_ -= n;
    advance(i.it_, n);
}

This has a noticeable effect on the generated code. As it turns out, advance is one of the relatively few places in the standard library where special handling of counted_iterator is advantageous. Let's examine some algorithms to see why that's the case.

Single-Pass Algorithms with Counted Iterators

First, let's look at a simple algorithm like for_each that makes exactly one pass through its input sequence:

template<InputIterator I, Regular S, Function<ValueType<I>> F>
    requires EqualityComparable<I, S>
I for_each(I first, S last, F f)
{
    for(; first != last; ++first)
        f(*first);
    return first;
}

When passed counted iterators, at each iteration of the loop, we do an increment, a decrement (for the underlying iterator and the count), and a comparison. Let's compare this against a hypothetical for_each_n algorithm that takes the underlying iterator and the count separately. It might look like this:

template<InputIterator I, Function<ValueType<I>> F>
I for_each_n(I first, DifferenceType<I> n, F f)
{
    for(; n != 0; ++first, --n)
        f(*first);
    return first;
}

For the hypothetical for_each_n, at each loop iteration, we do an increment, a decrement, and a comparison. That's exactly as many operations as for_each does when passed counted iterators. So a separate for_each_n algorithm is probably unnecessary if we have sentinels and counted_iterators. This is true for any algorithm that makes only one pass through the input range. That turns out to be a lot of algorithms.

Multi-Pass Algorithms with Counted Iterators

There are other algorithms that make more than one pass over the input sequence. Most of those, however, use advance when they need to move iterators by more than one hop. Once we have specialized advance for counted_iterator, those algorithms that use advance get faster without any extra work.

Consider partition_point. Here is one example implementation, taken from libc++[@libcxx] and ported to Concepts Lite and sentinels:

template<ForwardIterator I, Regular S, Predicate<ValueType<I>> P>
    requires EqualityComparable<I, S>
I partition_point(I first, S last, P pred)
{
    DifferenceType<I> len = distance(first, last);
    while (len != 0)
    {
        DifferenceType<I> l2 = len / 2;
        I m = first;
        advance(m, l2);
        if (pred(*m))
        {
            first = ++m;
            len -= l2 + 1;
        }
        else
            len = l2;
    }
    return first;
}

Imagine that I is a forward counted_iterator and S is a counted_sentinel. If the library does not specialize advance, this is certainly inefficient. Every time advance is called, needless work is being done. Compare it to a hypothetical partition_point_n:

template<ForwardIterator I, Predicate<ValueType<I>> P>
I partition_point_n(I first, DifferenceType<I> len, P pred)
{
    while (len != 0)
    {
        DifferenceType<I> l2 = len / 2;
        I m = first;
        advance(m, l2);
        if (pred(*m))
        {
            first = ++m;
            len -= l2 + 1;
        }
        else
            len = l2;
    }
    return first;
}

The first thing we notice is that partition_point_n doesn't need to call distance! The more subtle thing to note is that calling partition_point_n with a raw iterator and a count saves about O(N) integer decrements over the equivalent call to partition_point with counted_iterators ... unless, of course, we have specialized the advance algorithm as shown above. Once we have, we trade the O(N) integer decrements for O(log N) integer subtractions. That's a big improvement.

But what about the O(N) call to distance? Actually, that's easy, and it's the reason why the SizedIteratorRange concept exists. counted_iterator stores the distance to the end. So the distance between a counted_iterator and a counted_sentinel (or between two counted_iterators) is known in O(1) regardless of the iterator's category. The SizedIteratorRange concept tests whether an iterator I and a sentinel S can be subtracted to get the distance. This concept is modeled by random-access iterators by their nature, but also by counted iterators and their sentinels. The distance algorithm is specialized for SizedIteratorRange, so it is O(1) for counted iterators.

With these changes, we see that partition_point with counted iterators is very nearly as efficient as a hypothetical partition_point_n would be, and we had to make no special accommodations. Why can't we make partition_point exactly as efficient as partition_point_n? When partition_point is called with a counted iterator, it also returns a counted iterator. Counted iterators contain two datums: the position and distance to the end. But when partition_point_n returns just the position, it is actually computing and returning less information. Sometimes users don't need the extra information. But sometimes, after calling partition_point_n, the user might want to pass the resulting iterator to another algorithm. If that algorithm calls distance (like partition_point and other algorithms do), then it will be O(N). With counted iterators, however, it's O(1). So in the case of partition_point, counted iterators cause the algorithm to do O(log N) extra work, but it sometimes saves O(N) work later.

To see an example, imagine a trivial insertion_sort algorithm:

template<ForwardIterator I, Regular S>
    requires EqualityComparable<I, S> && Sortable<I> // from N3351
void insertion_sort(I begin, S end)
{
    for(auto it = begin; it != end; ++it)
    {
        auto insertion = upper_bound(begin, it, *it);
        rotate(insertion, it, next(it));
    }
}

Imagine that I is a counted_iterator. The first thing upper_bound does is call distance. Making distance O(1) for counted_iterators saves N calls of an O(N) algorithm. To get comparable performance for an equivalent procedure in today's STL, users would have to write a separate insertion_sort_n algorithm that dispatches to an upper_bound_n algorithm -- that they would also need to write themselves.

Counted Algorithms with Counted Iterators

We've seen that regular algorithms with counted iterators can be made nearly as efficient as dedicated counted algorithms, and that sometimes we are more than compensated for the small performance loss. All is not roses, however. There are a number of counted algorithms in the standard (the algorithms whose names end with _n). Consider copy_n:

template<WeakInputIterator I, WeakOutputIterator<ValueType<I>> O>
pair<I, O> copy_n(I in, DifferenceType<I> n, O out)
{
    for(; n != 0; ++in, ++out, --n)
        *out = *in;
    return {in, out};
}

(We've changed the return type of copy_n so as not to lose information.) If I is a counted iterator, then for every ++in, an increment and a decrement are happening, and in this case the extra decrement is totally unnecessary. For any counted (i.e., _n) algorithm, something special needs to be done to keep the performance from degrading when passed counted iterators.

The algorithm author has two options here, and neither of them is ideal.

Option 1: Overload the algorithm

The following is an optimized version of copy_n for counted iterators:

template<WeakInputIterator I, WeakOutputIterator<ValueType<I>> O>
pair<I, O> copy_n(counted_iterator<I> in, DifferenceType<I> n, O out)
{
    for(auto m = in.n_ - n; in.n_ != m; ++in.i_, --in.n_, ++out)
        *out = *in;
    return {in, out};
}

Obviously, creating an overload for counted iterators is unsatisfying.

Option 2: Separate the iterator from the count

This option shows how a library implementer can write just one version of copy_n that is automatically optimized for counted iterators. First, we need to provide two utility functions for unpacking and repacking counted iterators:

template<WeakIterator I>
I uncounted(I i)
{
    return i;
}

template<WeakIterator I>
I uncounted(counted_iterator<I> i)
{
    return i.it_;
}

template<WeakIterator I>
I recounted(I const &, I i, DifferenceType<I>)
{
    return i;
}

template<WeakIterator I>
counted_iterator<I> recounted(counted_iterator<I> const &j, I i, DifferenceType<I> n)
{
    return {i, j.n_ - n};
}

With the help of uncounted and recounted, we can write an optimized copy_n just once:

template<WeakInputIterator I, WeakOutputIterator<ValueType<I>> O>
pair<I, O> copy_n(I in_, DifferenceType<I> n_, O out)
{
    auto in = uncounted(in_);
    for(auto n = n_; n != 0; ++in, --n, ++out)
        *out = *in;
    return {recounted(in_, in, n_), out};
}

This version works optimally for both counted and non-counted iterators. It is not a thing of beauty, however. It's slightly annoying to have to do the uncounted/recounted dance, but it's mostly needed only in the counted algorithms.

As a final note, the overload of advance for counted iterators can be eliminated with the help of uncounted and recounted. After all, advance is a counted algorithm.

Benchmark: Insertion Sort

To test how expensive counted ranges and counted iterators are, we wrote a benchmark. The benchmark program pits counted ranges against a dedicated _n algorithm for Insertion Sort.

The program is listed below:

#include <chrono>
#include <iostream>
#include <range/v3/range.hpp>

class timer
{
private:
    std::chrono::high_resolution_clock::time_point start_;
public:
    timer()
    {
        reset();
    }
    void reset()
    {
        start_ = std::chrono::high_resolution_clock::now();
    }
    std::chrono::milliseconds elapsed() const
    {
        return std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::high_resolution_clock::now() - start_);
    }
    friend std::ostream &operator<<(std::ostream &sout, timer const &t)
    {
        return sout << t.elapsed().count() << "ms";
    }
};

template<typename It>
struct forward_iterator
{
    It it_;

    template<typename U>
    friend struct forward_iterator;
public:
    typedef          std::forward_iterator_tag                 iterator_category;
    typedef typename std::iterator_traits<It>::value_type      value_type;
    typedef typename std::iterator_traits<It>::difference_type difference_type;
    typedef It                                                 pointer;
    typedef typename std::iterator_traits<It>::reference       reference;

    forward_iterator() : it_() {}
    explicit forward_iterator(It it) : it_(it) {}

    reference operator*() const {return *it_;}
    pointer operator->() const {return it_;}

    forward_iterator& operator++() {++it_; return *this;}
    forward_iterator operator++(int)
        {forward_iterator tmp(*this); ++(*this); return tmp;}

    friend bool operator==(const forward_iterator& x, const forward_iterator& y)
        {return x.it_ == y.it_;}
    friend bool operator!=(const forward_iterator& x, const forward_iterator& y)
        {return !(x == y);}
};

template<typename I, typename V2>
I upper_bound_n(I begin, typename std::iterator_traits<I>::difference_type d, V2 const &val)
{
    while(0 != d)
    {
        auto half = d / 2;
        auto middle = std::next(begin, half);
        if(val < *middle)
            d = half;
        else
        {
            begin = ++middle;
            d -= half + 1;
        }
    }
    return begin;
}

template<typename I>
void insertion_sort_n(I begin, typename std::iterator_traits<I>::difference_type n)
{
    auto m = 0;
    for(auto it = begin; m != n; ++it, ++m)
    {
        auto insertion = upper_bound_n(begin, m, *it);
        ranges::rotate(insertion, it, std::next(it));
    }
}

template<typename I, typename S>
void insertion_sort(I begin, S end)
{
    for(auto it = begin; it != end; ++it)
    {
        auto insertion = ranges::upper_bound(begin, it, *it);
        ranges::rotate(insertion, it, std::next(it));
    }
}

template<typename Rng>
void insertion_sort(Rng && rng)
{
    ::insertion_sort(std::begin(rng), std::end(rng));
}

std::unique_ptr<int[]> data(int i)
{
    std::unique_ptr<int[]> a(new int[i]);
    auto rng = ranges::view::counted(a.get(), i);
    ranges::iota(rng, 0);
    return a;
}

void shuffle(int *a, int i)
{
    auto rng = ranges::view::counted(a, i);
    ranges::random_shuffle(rng);
}

constexpr int cloops = 3;

template<typename I>
void benchmark_n(int i)
{
    auto a = data(i);
    long ms = 0;
    for(int j = 0; j < cloops; ++j)
    {
        ::shuffle(a.get(), i);
        timer t;
        insertion_sort_n(I{a.get()}, i);
        ms += t.elapsed().count();
    }
    std::cout << (int)((double)ms/cloops) << std::endl;
}

template<typename I>
void benchmark_counted(int i)
{
    auto a = data(i);
    long ms = 0;
    for(int j = 0; j < cloops; ++j)
    {
        ::shuffle(a.get(), i);
        timer t;
        insertion_sort(ranges::view::counted(I{a.get()}, i));
        ms += t.elapsed().count();
    }
    std::cout << (int)((double)ms/cloops) << std::endl;
}

int main(int argc, char *argv[])
{
    if(argc < 2)
        return -1;

    int i = std::atoi(argv[1]);
    std::cout << "insertion_sort_n (random-access) : ";
    benchmark_n<int*>(i);
    std::cout << "insertion_sort   (random-access) : ";
    benchmark_counted<int*>(i);
    std::cout << "\n";
    std::cout << "insertion_sort_n (forward)       : ";
    benchmark_n<forward_iterator<int*>>(i);
    std::cout << "insertion_sort   (forward)       : ";
    benchmark_counted<forward_iterator<int*>>(i);
}

The program implements both insertion_sort_n, a dedicated counted algorithm, and insertion_sort, a general algorithm that accepts any Iterable, to which we pass a counted range. The latter is implemented in terms of the general-purpose upper_bound as provided by the Range v3 library, whereas the former requires a dedicated upper_bound_n algorithm, which is also provided.

The test is run both with raw pointers (hence, random-access) and with an iterator wrapper that only models ForwardIterator. Each test is run three times, and the resulting times are averaged. The test was compiled with g++ version 4.9.0 with -O3 -std=gnu++11 -DNDEBUG and run on a Linux machine. The results are reported below, for N == 30,000:

  insertion_sort_n insertion_sort
random-access 2.692 s 2.703 s
forward 23.853 s 23.817 s

The performance difference, if there is any, is lost in the noise. At least in this case, with this compiler, on this hardware, there is no performance justification for a dedicated _n algorithm.

Summary

In short, counted iterators are not a perfect abstraction. There is some precedent here. The iterators for deque, and for any segmented data structure, are known to be inefficient (see Segmented Iterators and Hierarchical Algorithms, Austern 1998[@austern98]). The fix for that problem, new iterator abstractions and separate hierarchical algorithm implementations, is invasive and is not attempted in any STL implementation we are aware of. In comparison, the extra complications that come with counted iterators seem quite small. For segmented iterators, the upside was the simplicity and uniformity of the Iterator abstraction. In the case of counted ranges and iterators, the upside is the simplicity and uniformity of the Iterable concept. Algorithms need only one form, not separate bounded, counted, and sentinel forms. Our benchmark gives us reasonable assurance that we aren't sacrificing performance for the sake of a unifying abstraction.

Appendix 5: Drive-By Improvements to the Standard Algorithms

As we are making changes to the standard algorithms, not all of which are strictly source compatible, here are some other drive-by changes that we might consider making. The changes suggested below have nothing to do with ranges per se, but they increase the power and uniformity of the STL and they have proven useful in the Adobe Source Library, so we might consider taking all these changes in one go.

Higher-Order Algorithms Should Take Invokables Instead of Functions

Some algorithms like for_each are higher-order; they take functions as parameters. In N3351[@n3351], they are constrained with the Function concept which, among other things, requires that its parameters can be used to form a valid callable expression f(a).

However, consider a class S with a member function Do, like:

class S {
public:
    void Do() const;
};

If we have a vector of S objects and we want to Do all of them, this is what we need to do:

for_each( v, [](auto & s) { s.Do(); });

or, more concisely with a bind expression:

for_each( v, bind(&S::Do, _1) );

Note that bind is specified in terms of a hypothetical INVOKE utility in [func.require]. Wouldn't it be more convenient if all the algorithms were required to merely take INVOKE-able things -- that is, things that can be passed to bind -- as arguments, instead of Functions? Then we can express the above call to for_each more concisely as:

for_each( v, &S::Do );

We can define an invokable utility function as:

template<typename R, typename T>
auto invokable(R T::* p) const -> decltype(std::mem_fn(p))
{
    return std::mem_fn(p);
}

template<typename T, typename U = decay_t<T>>
auto invokable(T && t) const -> enable_if_t<!is_member_pointer<U>::value, T>
{
    return std::forward<T>(t);
}

template<typename F>
using invokable_t = decltype(invokable(std::declval<F>()));

We can define an Invokable concept as:

concept Invokable<Semiregular F, typename... As> =
    Function<invokable_t<F>, As...> &&
    requires (F f, As... as) {
        InvokableResultOf<F, As...>;
        InvokableResultOf<F, As...> == invokable(f)(as...);
    };

The Invokable concept can be used to constrain algorithms instead of the Function concept. The algorithms would need to apply invokable to each Invokable argument before invoking it.

This is pure extension and would break no code.

Algorithms Should Take Invokable Projections

The Adobe Source Libraries (ASL)[@asl] pioneered the use of "projections" to make the algorithms more powerful and expressive by increasing interface symmetry. Sean Parent gives a motivating example in his "C++ Seasoning" talk[@cpp-seasoning], on slide 38. With today's STL, when using sort and lower_bound together with user-defined predicates, the predicate must sometimes differ. Consider:

std::sort(a, [](const employee& x, const employee& y)
             { return x.last < y.last; });
auto p = std::lower_bound(a, "Parent", [](const employee& x, const string& y)
                                       { return x.last < y; });

Notice the different predicates used in the invocations of sort and lower_bound. Since the predicates are different, there is a chance they might get out of sync leading to subtle bugs.

By introducing the use of projections, this code is simplified to:

std::sort(a, std::less<>(), &employee::last);
auto p = std::lower_bound(a, "Parent", std::less<>(), &employee::last);

Every element in the input sequence is first passed through the projection &employee::last. As a result, the simple comparison predicate std::less<> can be used in both places.

No effort was made in ASL to use projections consistently. This proposal bakes them in everywhere it makes sense. Here are the guidelines to be applied to the standard algorithms:

  • Wherever appropriate, algorithms should optionally take INVOKE-able projections that are applied to each element in the input sequence(s). This, in effect, allows users to trivially transform each input sequence for the sake of that single algorithm invocation.
  • Algorithms that take two input sequences should (optionally) take two projections.
  • For algorithms that optionally accept functions/predicates (e.g. transform, sort), projection arguments follow functions/predicates. There are no algorithm overloads that allow the user to specify the projection without also specifying a predicate, even if the default would suffice. This is to reduce the number of overloads and also to avoid any potential for ambiguity.

An open design question is whether all algorithms should take projections, or only the higher-order algorithms. In our current design, all algorithms take projections. This makes it trivial to, say, copy all the keys of a map by using the standard copy algorithm with &pair<const key,value>::first as the projection.

Projections versus Range Transform View

In a sense, the use of a projection parameter to an algorithm is similar to applying a transform view directly to a range. For example, calling std::find with a projection is similar to applying a transform to a range and calling without the projection:

auto it = std::find( a, 42, &employee::age );

auto a2 = a | view::transform( &employee::age );
auto it2 = std::find( a2, 42 );

Aside from the extra verbosity of the view-based solution, there are two meaningful differences: (1) The type of the resulting iterator is different; *it refers to an employee whereas *it2 refers to an int. And (2) if the transform function returns an rvalue, then the transformed view cannot model a forward sequence due to the current requirements on the ForwardIterator concept. The result of applying a transform view is an Input range unless the transform function returns an lvalue. The projection-based interface suffers no such degradation of the iterator category. (Note: if the concepts in N3351[@n3351] are adopted, this argument is no longer valid.) For those reasons, range transform adapters are not a replacement for projection arguments to algorithms.

See Algorithm Implementation with Projections for a discussion of how projections affect the implementation.

The addition of projection arguments to the algorithms is pure extension.

Appendix 6: Implementation Notes

On Distinguishing Ranges from Non-Range Iterables

The design of the range library depends on the ability to tell apart Ranges from Iterables. Ranges are lightweight objects that refer to elements they do not own. As a result, they can guarantee O(1) copyability and assignability. Iterables, on the other hand, may or may not own their elements, and so cannot guarantee anything about the algorithmic complexity of their copy and assign operations. Indeed, an Iterable may not be copyable at all: it may be a native array or a vector of move-only types.

But how to tell Ranges apart from Iterables? After all, whether an Iterable owns its elements or not is largely a semantic difference with no purely syntactic way to differentiate. Well, that's almost true...

It turns out that there is a reasonably good heuristic we can use to tell Iterables and Ranges apart. Imagine that we have some Iterable type T that is either a container like list, vector, or a native array; or else it's a Range like pair<int*,int*>. Then we can imagine taking iterators to T and const T, dereferencing them, and comparing the resulting reference types. The following table gives the results we might expect to find.

Expression Container Range
*begin(declval<T&>()) value_type & [const] value_type &
*begin(declval<const T&>()) const value_type & [const] value_type &

Notice how containers and ranges differ in the way a top-level cv-qualifier affects the reference type. Since a range is a proxy to elements stored elsewhere, a top-level const qualification on the range object typically has no effect at all on its iterator's reference type. But that's not true for a container that owns its elements.

We can use this to build an is_range traits that gives a pretty good guess whether an Iterable type is a range or not. This trait can be used to define the Range concept. Obviously since it's a trait, users are free to specialize it if the trait guesses wrong.

Some people want their range types to behave like containers with respect to the handling of top-level const; that is, they would like their ranges to be designed such that if the range object is const, the range's elements cannot be mutated through it. There is nothing about the Range concepts that precludes that design, but it does require the developer of such a range to specialize the is_range trait. If anything, the default behavior of the trait can be seen as gentle encouragement to handle top-level const in a way that is consistent with ranges' nature as a lightweight proxy.

As an added convenience, we can provide a class, range_base, from which users can create a derived type as another way of opting in to "range-ness". The is_range trait can test for derivation from range_base as an extra test. This would save users the trouble of opening the std namespace to specialize the is_range trait on the rare occasions that that is necessary.

The is_range trait will also need to be specialized for immutable containers, for which both the mutable and const iterators have the same reference type. A good example is std::set.

If the is_range trait erroneously reports false for a type that is actually a range, then the library errs on the side of caution and will prevent the user from using rvalues of that type in range adaptor pipelines. If, on the other hand, the is_range trait gets the answer wrong for a type that is actually a container, the container ends up being copied or moved into range adaptors. This is a performance bug, and it may give surprising results at runtime if the original container doesn't get mutated when the user thinks it should. It's not a memory error, though.

Algorithm Implementation with Projections

Rather than requiring additional overloads, the addition of projection arguments has very little cost to library implementers. The use of function template default parameters obviates the need for overloads. For instance, find can be defined as:

template<InputIterator I, Regular S, typename V, Invokable<ValueType<I>> Proj = identity>
    requires EqualityComparable<I, S> &&
             EqualityComparable<V, InvokableResultOf<Proj, ValueType<I>>>
I find(I first, S last, V const & val, Proj proj = Proj{})
{
    /* ... */
}

Algorithms That Need An End Iterator

Some algorithms need to know the real physical end of the input sequence so that the sequence can be traversed backwards, like reverse. In those cases, it's helpful to have an algorithm advance_to that takes an iterator and a sentinel and returns a real end iterator. advance_to looks like this:

template<Iterator I, Regular S>
    requires IteratorRange<I, S>
I advance_to( I i, S s )
{
    while(i != s)
        ++i;
    return i;
}

template<Iterator I, Regular S>
    requires SizedIteratorRange<I, S>
I advance_to( I i, S s )
{
    advance( i, s - i );
    return i;
}

template<Iterator I>
I advance_to( I, I s )
{
    return s;
}

When the sentinel is actually an iterator, we already know where the end is so we can just return it. Notice how we handle SizedIteratorRanges specially and dispatch to advance with a count. Appendix 4 shows how advance is optimized for counted iterators. By dispatching to advance when we can, we make advance_to faster for counted iterators, too.

With advance_to we can implement reverse generically as:

template<BidirectionalIterator I, Regular S>
    requires EqualityComparable<I, S> && Permutable<I>
I reverse( I first, S last_ )
{
    I last = advance_to( first, last_ ), end = last;
    while( first != last )
    {
        if( first == --last )
            break;
        iter_swap( first, last );
        ++first;
    }
    return end;
}

Since this algorithm necessarily computes the end of the sequence if it isn't known already, we return it.