PostgreSQL 源码解读（194）- 查询#110(排序#3 - 实现)

发布时间：2020-08-13 15:27:35 作者：husthxd
来源：ITPUB博客阅读：484

本节继续介绍排序的实现,上一节介绍了ExecSort,本节介绍该函数中调用的排序的具体实现函数,本节是第一部分,包括tuplesort_begin_heap/tuplesort_puttupleslot.

一、数据结构

SortState
排序运行期状态信息


/* ----------------
 *     SortState information
 *     排序运行期状态信息
 * ----------------
 */
typedef struct SortState
{
    //基类
    ScanState    ss;                /* its first field is NodeTag */
    //是否需要随机访问排序输出?
    bool        randomAccess;    /* need random access to sort output? */
    //结果集是否存在边界?
    bool        bounded;        /* is the result set bounded? */
    //如存在边界,需要多少个元组?
    int64        bound;            /* if bounded, how many tuples are needed */
    //是否已完成排序?
    bool        sort_Done;        /* sort completed yet? */
    //是否使用有界值?
    bool        bounded_Done;    /* value of bounded we did the sort with */
    //使用的有界值?
    int64        bound_Done;        /* value of bound we did the sort with */
    //tuplesort.c的私有状态
    void       *tuplesortstate; /* private state of tuplesort.c */
    //是否worker?
    bool        am_worker;        /* are we a worker? */
    //每个worker对应一个条目
    SharedSortInfo *shared_info;    /* one entry per worker */
} SortState;
/* ----------------
 *     Shared memory container for per-worker sort information
 *     per-worker排序信息的共享内存容器
 * ----------------
 */
typedef struct SharedSortInfo
{
    //worker个数?
    int            num_workers;
    //排序机制
    TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedSortInfo;

TuplesortInstrumentation
报告排序统计的数据结构.


/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 * 报告排序统计的数据结构.
 * 注意TuplesortInstrumentation不能包含指针因为有时候会把该结构体放在共享内存中.
 */
typedef enum
{
    SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中
    SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序
    SORT_TYPE_QUICKSORT,//快速排序
    SORT_TYPE_EXTERNAL_SORT,//外部排序
    SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并
} TuplesortMethod;//排序方法
typedef enum
{
    SORT_SPACE_TYPE_DISK,//需要用上磁盘
    SORT_SPACE_TYPE_MEMORY//使用内存
} TuplesortSpaceType;
typedef struct TuplesortInstrumentation
{
    //使用的排序算法
    TuplesortMethod sortMethod; /* sort algorithm used */
    //排序使用空间类型
    TuplesortSpaceType spaceType;    /* type of space spaceUsed represents */
    //空间消耗(以K为单位)
    long        spaceUsed;        /* space consumption, in kB */
} TuplesortInstrumentation;

二、源码解读

tuplesort_begin_heap和tuplesort_puttupleslot都是准备工作,把tuple放到数组(堆)中,为后续的实际排序实现作准备.
tuplesort_begin_heap


Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,//元组描述符
                     int nkeys, //排序键个数
                     AttrNumber *attNums,//属性编号
                     Oid *sortOperators, //排序操作符
                     Oid *sortCollations,//排序规则
                     bool *nullsFirstFlags,//标记
                     int workMem, //内存大小
                     SortCoordinate coordinate,//协调器 
                     bool randomAccess)//是否随机访问
{
    //获取排序状态
    Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
                                                   randomAccess);
    MemoryContext oldcontext;
    int            i;
    oldcontext = MemoryContextSwitchTo(state->sortcontext);
    AssertArg(nkeys > 0);
#ifdef TRACE_SORT
    if (trace_sort)
        elog(LOG,
             "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
             nkeys, workMem, randomAccess ? 't' : 'f');
#endif
    state->nKeys = nkeys;
    TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
                                false,    /* no unique check */
                                nkeys,
                                workMem,
                                randomAccess,
                                PARALLEL_SORT(state));
    //设置运行状态
    state->comparetup = comparetup_heap;
    state->copytup = copytup_heap;
    state->writetup = writetup_heap;
    state->readtup = readtup_heap;
    //假定不需要拷贝元组描述符
    state->tupDesc = tupDesc;    /* assume we need not copy tupDesc */
    state->abbrevNext = 10;
    /* Prepare SortSupport data for each column */
    //为每一列准备SortSupport数据(分配内存空间)
    state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
    for (i = 0; i < nkeys; i++)
    {
        //------- 遍历排序键
        //排序键
        SortSupport sortKey = state->sortKeys + i;
        AssertArg(attNums[i] != 0);
        AssertArg(sortOperators[i] != 0);
        //设置SortSupport
        sortKey->ssup_cxt = CurrentMemoryContext;
        sortKey->ssup_collation = sortCollations[i];
        sortKey->ssup_nulls_first = nullsFirstFlags[i];
        sortKey->ssup_attno = attNums[i];
        /* Convey if abbreviation optimization is applicable in principle */
        //缩写优化是否原则上可用?
        sortKey->abbreviate = (i == 0);
        //设置
        PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
    }
    /*
     * The "onlyKey" optimization cannot be used with abbreviated keys, since
     * tie-breaker comparisons may be required.  Typically, the optimization
     * is only of value to pass-by-value types anyway, whereas abbreviated
     * keys are typically only of value to pass-by-reference types.
     * 对于缩写优化"onlyKey"优化不能使用,因为可能需要tie-breaker比较.
     * 典型的,优化器只对按值传递类型有价值,而缩写建通常只对引用传递类型有价值.
     */
    if (nkeys == 1 && !state->sortKeys->abbrev_converter)
        state->onlyKey = state->sortKeys;
    MemoryContextSwitchTo(oldcontext);
    return state;
}

tuplesort_puttupleslot
接收一个元组(一行)


/*
 * Accept one tuple while collecting input data for sort.
 * 接收一个元组(一行)
 *
 * Note that the input data is always copied; the caller need not save it.
 * 注意输入数据通常会被拷贝,调用者不需要保存这些数据.
 */
void
tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
{
    MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
    SortTuple    stup;
    /*
     * Copy the given tuple into memory we control, and decrease availMem.
     * Then call the common code.
     * 拷贝给定的元组在我们控制的内存中,同时减少可用内存.
     * 然后调用puttuple_common函数.
     */
    //#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
    COPYTUP(state, &stup, (void *) slot);
    puttuple_common(state, &stup);
    MemoryContextSwitchTo(oldcontext);
}
/*
 * Shared code for tuple and datum cases.
 * 元组和datum共享的代码.
 */
static void
puttuple_common(Tuplesortstate *state, SortTuple *tuple)
{
    Assert(!LEADER(state));
    switch (state->status)
    {
        case TSS_INITIAL://初始化
            /*
             * Save the tuple into the unsorted array.  First, grow the array
             * as needed.  Note that we try to grow the array when there is
             * still one free slot remaining --- if we fail, there'll still be
             * room to store the incoming tuple, and then we'll switch to
             * tape-based operation.
             * 在未排序的数组中存储元组.
             * 首先,如需要则扩展数组大小.注意在只有1个slot剩下来的时候才尝试扩展数组.
             * 假如扩展失败,仍有空间可用于存储输入的元组,然后将切换至tape-based(基于磁带)操作.
             */
            if (state->memtupcount >= state->memtupsize - 1)
            {
                (void) grow_memtuples(state);
                Assert(state->memtupcount < state->memtupsize);
            }
            state->memtuples[state->memtupcount++] = *tuple;//在数组中保存元组
            /*
             * Check if it's time to switch over to a bounded heapsort. We do
             * so if the input tuple count exceeds twice the desired tuple
             * count (this is a heuristic for where heapsort becomes cheaper
             * than a quicksort), or if we've just filled workMem and have
             * enough tuples to meet the bound.
             * 检查是否切换至有界heapsort.
             * 在输入元组个数超出期望元组个数两倍的情况下执行该检查
             * (在堆排序比快速排序成本更低时所获得的洞见),
             *   或者如果我们已经填充了workMem并且有足够的元组满足bound时也执行该检查.
             *
             * Note that once we enter TSS_BOUNDED state we will always try to
             * complete the sort that way.  In the worst case, if later input
             * tuples are larger than earlier ones, this might cause us to
             * exceed workMem significantly.
             * 注意我们一旦进入TSS_BOUNDED状态,将尝试按指定的方式完成排序.
             * 在最坏的情况下,如果后续的输入元组比早前的要大,这可能会导致内存大小会超出workMem.
             */
            if (state->bounded &&
                (state->memtupcount > state->bound * 2 ||
                 (state->memtupcount > state->bound && LACKMEM(state))))
            {
#ifdef TRACE_SORT
                if (trace_sort)
                    elog(LOG, "switching to bounded heapsort at %d tuples: %s",
                         state->memtupcount,
                         pg_rusage_show(&state->ru_start));
#endif
                //切换至堆排序
                make_bounded_heap(state);
                return;
            }
            /*
             * Done if we still fit in available memory and have array slots.
             * 如果内存空间可用并且数组有位置存储,则已完成任务!
             */
            if (state->memtupcount < state->memtupsize && !LACKMEM(state))
                return;
            /*
             * Nope; time to switch to tape-based operation.
             * 切换至tape-base操作
             */
            inittapes(state, true);
            /*
             * Dump all tuples.
             * dump所有元组
             */
            dumptuples(state, false);
            break;
        case TSS_BOUNDED://有界堆排序
            /*
             * We don't want to grow the array here, so check whether the new
             * tuple can be discarded before putting it in.  This should be a
             * good speed optimization, too, since when there are many more
             * input tuples than the bound, most input tuples can be discarded
             * with just this one comparison.  Note that because we currently
             * have the sort direction reversed, we must check for <= not >=.
             * 不希望在这里扩展数组,因此检查在放到数组前检查是否可用废弃某些新元组.
             * 这将是一个很好的性能优化,因为存在比边界要大得多的输入元组,
             *   大多数输入元组会通过对比被废弃.
             */
            if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
            {
                /* new tuple <= top of the heap, so we can discard it */
                // 新元组 <= 堆顶,可以废弃
                free_sort_tuple(state, tuple);
                CHECK_FOR_INTERRUPTS();
            }
            else
            {
                /* discard top of heap, replacing it with the new tuple */
                //废弃堆顶,使用新元组替换
                free_sort_tuple(state, &state->memtuples[0]);
                tuplesort_heap_replace_top(state, tuple);
            }
            break;
        case TSS_BUILDRUNS://构建运行期信息
            /*
             * Save the tuple into the unsorted array (there must be space)
             * 保存元组到未排序数组中(存在空闲空间)
             */
            state->memtuples[state->memtupcount++] = *tuple;
            /*
             * If we are over the memory limit, dump all tuples.
             * 如果已超出内存限制,则dump所有元组
             */
            dumptuples(state, false);
            break;
        default:
            elog(ERROR, "invalid tuplesort state");
            break;
    }
}

三、跟踪分析

N/A

四、参考资料

N/A

PostgreSQL 源码解读（194）- 查询#110(排序#3 - 实现)

一、数据结构

二、源码解读

三、跟踪分析

四、参考资料

相关阅读