PostgreSQL 源码解读（199）- 查询#114(排序#7 - inittapes&dumptuples) - 关系型数据库

/* * Possible states of a Tuplesort object. These denote the states that * persist between calls of Tuplesort routines. * Tuplesort对象可能的状态. * 这些标示在Tuplesort例程调用之间会持久存在的状态. */ typedef enum { //装载元组,在内存限制之内 TSS_INITIAL, /* Loading tuples; still within memory limit */ //装载元组到有界堆中 TSS_BOUNDED, /* Loading tuples into bounded-size heap */ //装载元组,写入到tape中 TSS_BUILDRUNS, /* Loading tuples; writing to tape */ //完全在内存中完成排序 TSS_SORTEDINMEM, /* Sort completed entirely in memory */ //完成排序,最后在tape上执行排序 TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ //不落地执行最后的归并 TSS_FINALMERGE /* Performing final merge on-the-fly */ } TupSortStatus; /* * Parameters for calculation of number of tapes to use --- see inittapes() * and tuplesort_merge_order(). * 用于计算需要使用多少个tapes的参数.--- 详细参见inittapes()和tuplesort_merge_order(). * * In this calculation we assume that each tape will cost us about 1 blocks * worth of buffer space. This ignores the overhead of all the other data * structures needed for each tape, but it's probably close enough. * 在这个计算中,我们假定每一个tape会大概消耗缓存空间的一个block. * 虽然已经忽略了所有每个tape依赖的其他数据结构,但已经非常接近了. * * MERGE_BUFFER_SIZE is how much data we'd like to read from each input * tape during a preread cycle (see discussion at top of file). * MERGE_BUFFER_SIZE表示在每一轮读周期中我们将要从每个输入taple中读取的数据大小 */ #define MINORDER 6 /* minimum merge order */ #define MAXORDER 500 /* maximum merge order */ #define TAPE_BUFFER_OVERHEAD BLCKSZ #define MERGE_BUFFER_SIZE (BLCKSZ * 32) typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b, Tuplesortstate *state); /* * Private state of a Tuplesort operation. * Tuplesort操作的私有状态. */ struct Tuplesortstate { //状态 : 枚举值详见上面的信息 TupSortStatus status; /* enumerated value as shown above */ //sort key中的列数 int nKeys; /* number of columns in sort key */ //调用者需要随机访问? bool randomAccess; /* did caller request random access? */ //调用者是否指定了最大返回的元组的数目? bool bounded; /* did caller specify a maximum number of * tuples to return? */ //使用有界堆,则返回T bool boundUsed; /* true if we made use of a bounded heap */ //如为有界堆,这里存储最大的元组个数 int bound; /* if bounded, the maximum number of tuples */ //SortTuple.tuple是否可以设置? bool tuples; /* Can SortTuple.tuple ever be set? */ //剩余可用内存大小(单位:字节) int64 availMem; /* remaining memory available, in bytes */ //允许的内存总大小(单位:字节) int64 allowedMem; /* total memory allowed, in bytes */ //tapes个数 int maxTapes; /* number of tapes (Knuth's T) */ //tapes个数 - 1 int tapeRange; /* maxTapes-1 (Knuth's P) */ //主要用于排序数据的内存上下文 MemoryContext sortcontext; /* memory context holding most sort data */ //用于元组数据的sortcontext的子上下文 MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */ //临时文件中tapes的logtape.c对象 LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ /* * These function pointers decouple the routines that must know what kind * of tuple we are sorting from the routines that don't need to know it. * They are set up by the tuplesort_begin_xxx routines. * 这些函数指针将必须知道排序的哪种元组的例程与不需要知道它的例程解耦. * * Function to compare two tuples; result is per qsort() convention, ie: * <0, 0, >0 according as a<b, a=b, a>b. The API must match * qsort_arg_comparator. * 比较两个元组的函数,结果由每个qsort()约定,比如: * < 0, 0, >0代表a<b,a=b,a>b.API必须匹配qsort_arg_comparator. */ SortTupleComparator comparetup; /* * Function to copy a supplied input tuple into palloc'd space and set up * its SortTuple representation (ie, set tuple/datum1/isnull1). Also, * state->availMem must be decreased by the amount of space used for the * tuple copy (note the SortTuple struct itself is not counted). * 该函数用于拷贝一个输入的元组到由palloc分配的内存空间中, * 同时设置SortTuple数据结构(比如设置tuple/datum1/isnull1等). * 同时,state->availMem必须减去用于元组拷贝的空间大小(注意:SortTuple结构体不计算在内). */ void (*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup); /* * Function to write a stored tuple onto tape. The representation of the * tuple on tape need not be the same as it is in memory; requirements on * the tape representation are given below. Unless the slab allocator is * used, after writing the tuple, pfree() the out-of-line data (not the * SortTuple struct!), and increase state->availMem by the amount of * memory space thereby released. * 用于写入元组到taple的函数. * tape中的元组声明不需要与内存中的一致,tape中的声明要求详见下面说明. * 除非使用slab分配器,在写入元组后,pfree() out-of-line的数据(不是SortTuple结构体), * 同时把刚才释放的内存空间加到state->availMem中. */ void (*writetup) (Tuplesortstate *state, int tapenum, SortTuple *stup); /* * Function to read a stored tuple from tape back into memory. 'len' is * the already-read length of the stored tuple. The tuple is allocated * from the slab memory arena, or is palloc'd, see readtup_alloc(). * 从tape中读取元组到内存中的函数. * 'len'是已读取的存储元组的长度.元组在slab内存空间/palloc中分配,详细参考readtup_alloc()函数 */ void (*readtup) (Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); /* * This array holds the tuples now in sort memory. If we are in state * INITIAL, the tuples are in no particular order; if we are in state * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS * and FINALMERGE, the tuples are organized in "heap" order per Algorithm * H. In state SORTEDONTAPE, the array is not used. * 该数组保存排序内存中的元组.当前状态为 * INITIAL:元组没有特定的顺序; * SORTEDINMEM:元组处于最终已排序的状态; * BUILDRUNS/FINALMERGE:元组按算法H的'堆'顺序组织. * SORTEDONTAPE:数组未使用. */ //SortTuple结构体数组 SortTuple *memtuples; /* array of SortTuple structs */ //当前存在的元组数 int memtupcount; /* number of tuples currently present */ //memtuples数组的已分配的大小 int memtupsize; /* allocated length of memtuples array */ //memtuples的增长仍在进行中? bool growmemtuples; /* memtuples' growth still underway? */ /* * Memory for tuples is sometimes allocated using a simple slab allocator, * rather than with palloc(). Currently, we switch to slab allocation * when we start merging. Merging only needs to keep a small, fixed * number of tuples in memory at any time, so we can avoid the * palloc/pfree overhead by recycling a fixed number of fixed-size slots * to hold the tuples. * 有时候元组的内存分配使用简单的slab分配器实现而不是palloc(). * 在开始归并时,同步切换至slab分配器.归并只需要在内存中保持简单/固定的元组数目, * 因此可以避免频繁回收固定数目固定大小的slots(用于保存元组)而导致的palloc/pfree过载. * * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE * slots. The allocation is sized to have one slot per tape, plus one * additional slot. We need that many slots to hold all the tuples kept * in the heap during merge, plus the one we have last returned from the * sort, with tuplesort_gettuple. * 对于slab,使用大型分配器,拆分为SLAB_SLOT_SIZE个大小的slots. * 分配器的大小为每个tape一个slot,外加一个为的slot. * 我们需要这么多slots是因为在归并期间需要保存所有在堆中的元组,外加tuplesort_gettuple从排序中最终返回的元组 * * Initially, all the slots are kept in a linked list of free slots. When * a tuple is read from a tape, it is put to the next available slot, if * it fits. If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd * instead. * 一开始,所有的slots在空闲slots中以链表的方式保存. * 从tape中读取元组时,如合适的话,元组会放到下一个可用slot中. * 如果元组比SLAB_SLOT_SIZE要大,改用palloc分配内存空间. * * When we're done processing a tuple, we return the slot back to the free * list, or pfree() if it was palloc'd. We know that a tuple was * allocated from the slab, if its pointer value is between * slabMemoryBegin and -End. * 如果已完成元组的处理,返回slot到空闲链表中,如果使用palloc分配则使用pfree回收空间. * 如果元组指针值在slabMemoryBegin和slabMemoryEnd之间,那么我们可以知道元组是从slab中分配的. * * When the slab allocator is used, the USEMEM/LACKMEM mechanism of * tracking memory usage is not used. * 如使用了slab分配器,不会使用USEMEM/LACKMEM机制跟踪内存使用. */ bool slabAllocatorUsed; //slab内存空间的起始位置 char *slabMemoryBegin; /* beginning of slab memory arena */ //slab内存空间的结束位置 char *slabMemoryEnd; /* end of slab memory arena */ //链表头 SlabSlot *slabFreeHead; /* head of free list */ /* Buffer size to use for reading input tapes, during merge. */ //在归并期间用于读取输入tapes的缓存大小 size_t read_buffer_size; /* * When we return a tuple to the caller in tuplesort_gettuple_XXX, that * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE * modes), we remember the tuple in 'lastReturnedTuple', so that we can * recycle the memory on next gettuple call. * 通过tuplesort_gettuple_XXX方法调用返回元组给调用者时,元组从tape中获取 * (也就是说,TSS_SORTEDONTAPE/TSS_FINALMERGE模式),这时候会把元组放在'lastReturnedTuple'中, * 因此可在下次gettuple调用中回收内存. */ void *lastReturnedTuple; /* * While building initial runs, this is the current output run number. * Afterwards, it is the number of initial runs we made. * 在构建initial运行时,这是当前输出的run编号. * 后续这是我们构建好的initial runs的编号. */ int currentRun; /* * Unless otherwise noted, all pointer variables below are pointers to * arrays of length maxTapes, holding per-tape data. * 除非特别注意,下面所有的指针变量是指向长度为maxTapes的数组,保存per-tape数据. */ /* * This variable is only used during merge passes. mergeactive[i] is true * if we are reading an input run from (actual) tape number i and have not * yet exhausted that run. * 该变量在归并过程中使用. * mergeactive[i]为T如果我们从编号为i的tape中读取数据,仍未在该run中消耗完毕. */ //活跃的输入run源? bool *mergeactive; /* active input run source? */ /* * Variables for Algorithm D. Note that destTape is a "logical" tape * number, ie, an index into the tp_xxx[] arrays. Be careful to keep * "logical" and "actual" tape numbers straight! * 用于算法D的变量. * 注意destTape是一个逻辑tape编号,例如,是指向tp_xxx[]数组的索引. * 注意保持"逻辑"和"实际"tape编号的连续性. */ //Knuth's l int Level; /* Knuth's l */ //当前输出tape(Knuth's j) int destTape; /* current output tape (Knuth's j, less 1) */ //目标斐波拉契run计数(A[]) int *tp_fib; /* Target Fibonacci run counts (A[]) */ //每一个tape上真正runs的编号 int *tp_runs; /* # of real runs on each tape */ //每一个tape(D[])上虚拟runs的编号 int *tp_dummy; /* # of dummy runs for each tape (D[]) */ //实际的tape编号(TAPE[]) int *tp_tapenum; /* Actual tape numbers (TAPE[]) */ //归并轮中的活动输入tapes编号 int activeTapes; /* # of active input tapes in merge pass */ /* * These variables are used after completion of sorting to keep track of * the next tuple to return. (In the tape case, the tape's current read * position is also critical state.) * 这些变量用于在排序完成后保持下一个返回元组时的跟踪. * (在tape情况下,tape的当前读取位置也是重要的状态) */ //已完成输出的实际tape编号 int result_tape; /* actual tape number of finished output */ //数组编号(仅用于SORTEDINMEM) int current; /* array index (only used if SORTEDINMEM) */ //是否到达EOF(用于游标) bool eof_reached; /* reached EOF (needed for cursors) */ /* markpos_xxx holds marked position for mark and restore */ //markpos_xxx保持已标记的位置,用于标记和存储 //tape block编号(只用于SORTEDONTAPE) long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ //存储的"current",或者tape块中的偏移 int markpos_offset; /* saved "current", or offset in tape block */ //存储的eof_reached bool markpos_eof; /* saved "eof_reached" */ /* * These variables are used during parallel sorting. * 这些变量用于并行排序. * * worker is our worker identifier. Follows the general convention that * -1 value relates to a leader tuplesort, and values >= 0 worker * tuplesorts. (-1 can also be a serial tuplesort.) * worker是worker标识符ID. * 遵循一般约定，-1值与leader tuplesort相关，并且值>= 0表示worker tuplesorts。 * (在串行tuplesort时,-1也可以表示这种情况) * * shared is mutable shared memory state, which is used to coordinate * parallel sorts. * shared是可变的共享内存状态,用于协调并行排序. * * nParticipants is the number of worker Tuplesortstates known by the * leader to have actually been launched, which implies that they must * finish a run leader can merge. Typically includes a worker state held * by the leader process itself. Set in the leader Tuplesortstate only. * nParticipants是已知的worker Tuplesortstates的数目,这些worker由leader感知,是实际启动的worker数, * 这也意味着在leader可以归并前这些worker必须完成. * 典型的,leader进行自身包含至少一个worker状态. * 只在leader的Tuplesortstate中设置. */ int worker; Sharedsort *shared; int nParticipants; /* * The sortKeys variable is used by every case other than the hash index * case; it is set by tuplesort_begin_xxx. tupDesc is only used by the * MinimalTuple and CLUSTER routines, though. * sortKeys变量用于every而不是hash index,通过tuplesort_begin_xxx设置. * tupDesc只由MinimalTuple和CLUSTER例程使用. */ TupleDesc tupDesc; //长度nKeys数组 SortSupport sortKeys; /* array of length nKeys */ /* * This variable is shared by the single-key MinimalTuple case and the * Datum case (which both use qsort_ssup()). Otherwise it's NULL. * 该变量在单键MinimalTuple和Datum情况下(使用qsort_ssup()函数)共享使用,否则的话值为NULL. */ SortSupport onlyKey; /* * Additional state for managing "abbreviated key" sortsupport routines * (which currently may be used by all cases except the hash index case). * Tracks the intervals at which the optimization's effectiveness is * tested. * 管理"缩写键"sortsupport过程的额外状态. * (除了hash index外会被其他情况使用) * 跟踪在优化器有效性测试时时间间隔. */ int64 abbrevNext; /* Tuple # at which to next check * applicability */ /* * These variables are specific to the CLUSTER case; they are set by * tuplesort_begin_cluster. * 这些变量仅在CLUSTER时生效,通过tuplesort_begin_cluster设置. */ //将用于依赖的索引信息 IndexInfo *indexInfo; /* info about index being used for reference */ //解析索引表达式的运行期状态 EState *estate; /* for evaluating index expressions */ /* * These variables are specific to the IndexTuple case; they are set by * tuplesort_begin_index_xxx and used only by the IndexTuple routines. * 这些变量仅用于IndexTuple. * 通过tuplesort_begin_index_xxx设置,仅用于IndexTuple例程. */ //数据表 Relation heapRel; /* table the index is being built on */ //正在创建的index Relation indexRel; /* index being built */ /* These are specific to the index_btree subcase: */ //这些仅在index_btree下使用 //如发现重复元组,则提示 bool enforceUnique; /* complain if we find duplicate tuples */ /* These are specific to the index_hash subcase: */ //index_hash情况 uint32 high_mask; /* masks for sortable part of hash code */ uint32 low_mask; uint32 max_buckets; /* * These variables are specific to the Datum case; they are set by * tuplesort_begin_datum and used only by the DatumTuple routines. * 这些变量用于Datum,通过tuplesort_begin_datum设置,仅用于DatumTuple例程. */ Oid datumType; /* we need typelen in order to know how to copy the Datums. */ //需要typelen用于知道如何拷贝Datums. int datumTypeLen; /* * Resource snapshot for time of sort start. * 在排序开始时的资源快照 */ #ifdef TRACE_SORT PGRUsage ru_start; #endif };

/* * inittapes - initialize for tape sorting. * inittapes - 初始化tape sorting(Polyphase Merging). * * This is called only if we have found we won't sort in memory. * 在内存不足以满足排序需求时才调用此函数. */ static void inittapes(Tuplesortstate *state, bool mergeruns) { int maxTapes,//最大tapes j; Assert(!LEADER(state)); if (mergeruns) { /* Compute number of tapes to use: merge order plus 1 */ //计算tapes数 : 归并顺序 + 1 /* #define MINORDER 6 #define MAXORDER 500 #define TAPE_BUFFER_OVERHEAD BLCKSZ #define MERGE_BUFFER_SIZE (BLCKSZ * 32) mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) / (MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD); mOrder = Max(mOrder, MINORDER); mOrder = Min(mOrder, MAXORDER); */ maxTapes = tuplesort_merge_order(state->allowedMem) + 1; } else { /* Workers can sometimes produce single run, output without merge */ //Worker进程有时可能产生单个run,不需要归并直接输出. Assert(WORKER(state)); maxTapes = MINORDER + 1; } #ifdef TRACE_SORT if (trace_sort) elog(LOG, "worker %d switching to external sort with %d tapes: %s", state->worker, maxTapes, pg_rusage_show(&state->ru_start)); #endif /* Create the tape set and allocate the per-tape data arrays */ //创建tape集合并分配per-tape数据数组 inittapestate(state, maxTapes); state->tapeset = LogicalTapeSetCreate(maxTapes, NULL, state->shared ? &state->shared->fileset : NULL, state->worker); state->currentRun = 0; /* * Initialize variables of Algorithm D (step D1). * 初始化算法D的变量(step D1) */ for (j = 0; j < maxTapes; j++) { state->tp_fib[j] = 1; state->tp_runs[j] = 0; state->tp_dummy[j] = 1; state->tp_tapenum[j] = j; } state->tp_fib[state->tapeRange] = 0; state->tp_dummy[state->tapeRange] = 0; state->Level = 1; state->destTape = 0; //变更状态为TSS_BUILDRUNS state->status = TSS_BUILDRUNS; }

/* * dumptuples - remove tuples from memtuples and write initial run to tape * 清除memtuples中的元组并写入初始run到tape中 * * When alltuples = true, dump everything currently in memory. (This case is * only used at end of input data.) * 如alltuples为T,dump内存中的所有数据. * (仅适用于与输入数据结束时) */ static void dumptuples(Tuplesortstate *state, bool alltuples) { int memtupwrite; int i; /* * Nothing to do if we still fit in available memory and have array slots, * unless this is the final call during initial run generation. * 如果可以放入可用内存中并且仍有数据slots,并且当前不是在初始run产生时的最后一次调用,则返回 */ if (state->memtupcount < state->memtupsize && !LACKMEM(state) && !alltuples) return; /* * Final call might require no sorting, in rare cases where we just so * happen to have previously LACKMEM()'d at the point where exactly all * remaining tuples are loaded into memory, just before input was * exhausted. * 最后一次调用可能不需要排序,在极少数情况下, * 我们恰好在输入耗尽之前调用了LACKMEM()'d,此时所有剩余的元组都被加载到内存中. * * In general, short final runs are quite possible. Rather than allowing * a special case where there was a superfluous selectnewtape() call (i.e. * a call with no subsequent run actually written to destTape), we prefer * to write out a 0 tuple run. * 通常来说,最后的runs很有可能较短. * 与其允许存在一个额外的selectnewtape()函数调用(即没有后续运行实际写入到destTape的调用), * 还不如编写一个0元组的run. * * mergereadnext() is prepared for 0 tuple runs, and will reliably mark * the tape inactive for the merge when called from beginmerge(). This * case is therefore similar to the case where mergeonerun() finds a dummy * run for the tape, and so doesn't need to merge a run from the tape (or * conceptually "merges" the dummy run, if you prefer). According to * Knuth, Algorithm D "isn't strictly optimal" in its method of * distribution and dummy run assignment; this edge case seems very * unlikely to make that appreciably worse. * mergereadnext()为0元组runs作准备,在beginmerge()函数调用该函数时将标记tape为非活动状态. * 这种情况与mergeonerun()为tape检索到虚拟run类似,因此不需要归并(如果你愿意,可以执行名义上的归并). * 按照Knuth的说法,算法D不是严格分布和虚拟run分配优化的,但这种极端的情况不太可能让情况很糟糕. */ Assert(state->status == TSS_BUILDRUNS); /* * It seems unlikely that this limit will ever be exceeded, but take no * chances * 越界了. */ if (state->currentRun == INT_MAX) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot have more than %d runs for an external sort", INT_MAX))); state->currentRun++; #ifdef TRACE_SORT if (trace_sort) elog(LOG, "worker %d starting quicksort of run %d: %s", state->worker, state->currentRun, pg_rusage_show(&state->ru_start)); #endif /* * Sort all tuples accumulated within the allowed amount of memory for * this run using quicksort * 使用快速排序对内存中的元组进行排序. */ tuplesort_sort_memtuples(state); #ifdef TRACE_SORT if (trace_sort) elog(LOG, "worker %d finished quicksort of run %d: %s", state->worker, state->currentRun, pg_rusage_show(&state->ru_start)); #endif //写入到tape中 memtupwrite = state->memtupcount; for (i = 0; i < memtupwrite; i++) { WRITETUP(state, state->tp_tapenum[state->destTape], &state->memtuples[i]); state->memtupcount--; } /* * Reset tuple memory. We've freed all of the tuples that we previously * allocated. It's important to avoid fragmentation when there is a stark * change in the sizes of incoming tuples. Fragmentation due to * AllocSetFree's bucketing by size class might be particularly bad if * this step wasn't taken. * 重置tuple内存上下文. * 目的是为了避免内存碎片. */ MemoryContextReset(state->tuplecontext); markrunend(state, state->tp_tapenum[state->destTape]); state->tp_runs[state->destTape]++; state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ #ifdef TRACE_SORT if (trace_sort) elog(LOG, "worker %d finished writing run %d to tape %d: %s", state->worker, state->currentRun, state->destTape, pg_rusage_show(&state->ru_start)); #endif //未完成所有元组的处理,分配新的tape if (!alltuples) selectnewtape(state); }

您好，登录后才能下订单哦！

密码登录

忘记密码？

登录注册

获取短信验证码

其他方式登录

点击登录注册即表示同意《亿速云用户服务条款》

PostgreSQL 源码解读（199）- 查询#114(排序#7 - inittapes&dumptuples)

一、数据结构

二、源码解读

三、跟踪分析

四、参考资料

相关阅读