PostgreSQL 的pg_buffercache 代码研究

更新时间：2022-09-16 16:36:29

pg_buffercache 代码位于 contrib 目录，总体上代码量200多行。

刚接触，感觉直接访问PostgreSQL 中的内存结构很神奇，特意学习了一下。

/*-------------------------------------------------------------------------                        
 *                        
 * pg_buffercache_pages.c                        
 *      display some contents of the buffer cache                    
 *                        
 *      contrib/pg_buffercache/pg_buffercache_pages.c                    
 *-------------------------------------------------------------------------                        
 */                        
#include "postgres.h"                        
                        
#include "catalog/pg_type.h"                        
#include "funcapi.h"                        
#include "storage/buf_internals.h"                        
#include "storage/bufmgr.h"                        
                        
                        
/*
 * Number of columns in each row returned by pg_buffercache_pages(); it sizes
 * the tuple descriptor and the per-row values[]/nulls[] arrays.
 */
#define NUM_BUFFERCACHE_PAGES_ELEM    8                    
                        
/* NOTE(review): presumably the magic block the server checks when loading this library — confirm against fmgr.h. */
PG_MODULE_MAGIC;                        
                        
/* Forward declaration of the SQL-callable entry point defined below. */
Datum    pg_buffercache_pages(PG_FUNCTION_ARGS);                    
                        
                        
/*                        
 * Record structure holding the to be exposed cache data.                        
 */                        
typedef struct                        
{                        
    uint32        bufferid;            /* value of BufferDescriptorGetBuffer() for this header */
    Oid        relfilenode;            /* copied from tag.rnode.relNode */
    Oid        reltablespace;            /* copied from tag.rnode.spcNode */
    Oid        reldatabase;            /* copied from tag.rnode.dbNode */
    ForkNumber        forknum;            /* copied from tag.forkNum */
    BlockNumber         blocknum;            /* copied from tag.blockNum */
    bool        isvalid;            /* true only if both BM_VALID and BM_TAG_VALID were set */
    bool        isdirty;            /* true if BM_DIRTY was set */
    uint16        usagecount;            /* copied from the header's usage_count */
} BufferCachePagesRec;                        
                        
                        
/*                        
 * Function context for data persisting over repeated calls.                        
 */                        
typedef struct                        
{                        
    TupleDesc    tupdesc;                /* blessed descriptor of the result row, built on the first call */
    BufferCachePagesRec *record;                    /* array of NBuffers records filled on the first call */
} BufferCachePagesContext;                        
                        
                        
/*
 * pg_buffercache_pages
 *      Set-returning function that reports the contents of the shared
 *      buffer cache, one row per buffer.
 *
 * Result columns: bufferid, relfilenode, reltablespace, reldatabase,
 * relforknumber, relblocknumber, isdirty, usage_count.  For a buffer that is
 * unused or not valid, every column except bufferid is NULL.
 *
 * On the first call all NBuffers buffer headers are copied into memory that
 * lives in the multi-call context, while every buffer mapping partition lock
 * is held in shared mode.  Each later call only formats one saved record, so
 * the rows describe the moment of the first call, not the current state.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);

Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    Datum        result;
    MemoryContext oldcontext;
    BufferCachePagesContext *fctx;    /* User function context. */
    TupleDesc    tupledesc;
    HeapTuple    tuple;

    if (SRF_IS_FIRSTCALL())
    {
        int            i;
        volatile BufferDesc *bufHdr;

        funcctx = SRF_FIRSTCALL_INIT();

        /* Switch context when allocating stuff to be used in later calls */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

        /* Create a user function context for cross-call persistence */
        fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

        /* Construct a tuple descriptor for the result rows. */
        tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
                           INT4OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
                           OIDOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
                           INT2OID, -1, 0);
        /* int8 column: the block number is widened to int64 when emitted */
        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
                           INT8OID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
                           BOOLOID, -1, 0);
        TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
                           INT2OID, -1, 0);

        fctx->tupdesc = BlessTupleDesc(tupledesc);

        /* Allocate NBuffers worth of BufferCachePagesRec records. */
        fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers);

        /* Set max calls and remember the user function context. */
        funcctx->max_calls = NBuffers;
        funcctx->user_fctx = fctx;

        /* Return to original context when allocating transient memory */
        MemoryContextSwitchTo(oldcontext);

        /*
         * To get a consistent picture of the buffer state, we must lock all
         * partitions of the buffer map.  Needless to say, this is horrible
         * for concurrency.  Must grab locks in increasing order to avoid
         * possible deadlocks.
         */
        for (i = 0; i < NUM_BUFFER_PARTITIONS; i++)
            LWLockAcquire(FirstBufMappingLock + i, LW_SHARED);

        /*
         * Scan through all the buffers, saving the relevant fields in the
         * fctx->record structure.
         */
        for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
        {
            BufferCachePagesRec *rec = &fctx->record[i];

            /* Lock each buffer header before inspecting. */
            LockBufHdr(bufHdr);

            rec->bufferid = BufferDescriptorGetBuffer(bufHdr);
            rec->relfilenode = bufHdr->tag.rnode.relNode;
            rec->reltablespace = bufHdr->tag.rnode.spcNode;
            rec->reldatabase = bufHdr->tag.rnode.dbNode;
            rec->forknum = bufHdr->tag.forkNum;
            rec->blocknum = bufHdr->tag.blockNum;
            rec->usagecount = bufHdr->usage_count;

            rec->isdirty = (bufHdr->flags & BM_DIRTY) != 0;

            /* Note if the buffer is valid, and has storage created */
            rec->isvalid = (bufHdr->flags & BM_VALID) != 0 &&
                (bufHdr->flags & BM_TAG_VALID) != 0;

            UnlockBufHdr(bufHdr);
        }

        /*
         * And release locks.  We do this in reverse order for two reasons:
         * (1) Anyone else who needs more than one of the locks will be trying
         * to lock them in increasing order; we don't want to release the
         * other process until it can get all the locks it needs. (2) This
         * avoids O(N^2) behavior inside LWLockRelease.
         */
        for (i = NUM_BUFFER_PARTITIONS; --i >= 0;)
            LWLockRelease(FirstBufMappingLock + i);
    }

    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = funcctx->user_fctx;

    if (funcctx->call_cntr < funcctx->max_calls)
    {
        uint32        i = funcctx->call_cntr;
        Datum        values[NUM_BUFFERCACHE_PAGES_ELEM];
        bool        nulls[NUM_BUFFERCACHE_PAGES_ELEM];

        values[0] = Int32GetDatum(fctx->record[i].bufferid);
        nulls[0] = false;

        /*
         * Set all fields except the bufferid to null if the buffer is unused
         * or not valid.
         */
        if (fctx->record[i].blocknum == InvalidBlockNumber ||
            fctx->record[i].isvalid == false)
        {
            int            col;

            for (col = 1; col < NUM_BUFFERCACHE_PAGES_ELEM; col++)
                nulls[col] = true;
        }
        else
        {
            values[1] = ObjectIdGetDatum(fctx->record[i].relfilenode);
            nulls[1] = false;
            values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
            nulls[2] = false;
            values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
            nulls[3] = false;
            /* relforknumber is declared INT2OID, so build an int2 Datum */
            values[4] = Int16GetDatum(fctx->record[i].forknum);
            nulls[4] = false;
            values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
            nulls[5] = false;
            values[6] = BoolGetDatum(fctx->record[i].isdirty);
            nulls[6] = false;
            values[7] = Int16GetDatum(fctx->record[i].usagecount);
            nulls[7] = false;
        }

        /* Build and return the tuple. */
        tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
        result = HeapTupleGetDatum(tuple);

        SRF_RETURN_NEXT(funcctx, result);
    }

    /*
     * SRF_RETURN_NEXT returns from the function, so we get here only once
     * call_cntr has reached max_calls and every row has been emitted.
     */
    SRF_RETURN_DONE(funcctx);
}

我的看法是这样的：

官方给的例子是这样说的：

Datum
my_set_returning_function(PG_FUNCTION_ARGS)
{
  FuncCallContext *funcctx;
  Datum result;
  MemoryContext oldcontext;
  further declarations as needed

  if (SRF_IS_FIRSTCALL()) {
    funcctx = SRF_FIRSTCALL_INIT();
    oldcontext =
        MemoryContextSwitchTo(funcctx->
                              multi_call_memory_ctx);
    /* One-time setup code appears here: */
    <<user code>>
    <<if returning composite>>
    <<build TupleDesc, and perhaps AttInMetadata>>
    <<endif returning composite>>
    <<user code>>
    MemoryContextSwitchTo(oldcontext);
  }

  /* Each-time setup code appears here: */
  <<user code>>
  funcctx = SRF_PERCALL_SETUP();
  <<user code>>

  /* this is just one way we might test whether we are
     done: */
  if (funcctx->call_cntr < funcctx->max_calls) {
    /* Here we want to return another item: */
    <<user code>>
    <<obtain result Datum>>
    SRF_RETURN_NEXT(funcctx, result);
  } else {
    /* Here we are done returning items and just need to
       clean up: */
    <<user code>>
    SRF_RETURN_DONE(funcctx);
  }
}

对于其中的 call_cntr 和 max_calls ，不是太理解。原来想，是否一次就该结束了。但是验证的结果是：其实还是会被调用多次。

[作者：技术者高健@博客园 mail: luckyjackgao@gmail.com]

修改代码后部为这个样子：

……                            
if (funcctx->call_cntr < funcctx->max_calls)                            
{                            
    fprintf(stderr,"!!!!!call_cntr is smaller than max_calls");                        
    ……                        
    SRF_RETURN_NEXT(funcctx, result);                        
}                            
else{                            
    fprintf(stderr,"call_cntr is not smaller than max_calls");                        
    SRF_RETURN_DONE(funcctx);                        
}

运行结果：出现多次 !!!!!call_cntr is smaller than max_calls ，最后出现一次 call_cntr is not smaller than max_calls

哪怕是我用 select bufferid from pg_buffercache limit 1; 也是一样的效果。

再次测试，可以发现， max_calls 居然为4096，也可以说，我们在 psql 中对 pg_buffercache 的一次普通查询，在后台进行了4096次运转。好古怪！

如果psql 中运行 select count(*) from pg_buffercache; 得出结果正好是 4096。

然后我们再看 NBuffers 的值，在程序中下列循环之前，打印 NBuffers的值，发现只执行一次：值也是 4096

/*
* Scan through all the buffers, saving the relevant fields in the
* fctx->record structure.
*/
for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
{

...

}

那么可以说：

  /* Each-time setup code appears here: */
  <<user code>>
  funcctx = SRF_PERCALL_SETUP();
  <<user code>>

之前的代码，即 SRF_IS_FIRSTCALL() 分支中的一次性初始化部分，只执行一次。后面的代码，虽然我们没有写循环，但整个函数会被反复调用（每次通过 SRF_RETURN_NEXT 返回一行），直到我们的代码执行了 SRF_RETURN_DONE 为止。

上一篇：Oracle数据库的非归档模式迁移到归档模式

下一篇：sqlplus 中各列对齐设定

PostgreSQL 的pg_buffercache 代码研究

相关阅读

推荐文章