Redis Internal Data Structures

sds

All strings inside Redis are stored using its own implementation, sds (simple dynamic string).

struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
  • len: the string length. Keeping an explicit length avoids relying on \0 as a terminator, so the string can hold arbitrary binary data (binary safe).
  • alloc: the allocated capacity (excluding header and null terminator). Allocating memory is expensive, so sds over-allocates a little on every growth. Why allocation is expensive:
    • Memory management: Linux uses virtual memory, so page-table management and address translation add overhead.
    • Allocation algorithm: the allocator can fragment memory, hurting utilization and allocation speed.
    • Kernel/user mode switches: requesting memory involves switching between user and kernel mode, which adds further cost.
  • flags: the low 3 bits store the header type
#define SDS_TYPE_5  0
#define SDS_TYPE_8  1
#define SDS_TYPE_16 2
#define SDS_TYPE_32 3
#define SDS_TYPE_64 4
  • buf: the string content

sds has five header definitions — sdshdr5, sdshdr8, sdshdr16, sdshdr32 and sdshdr64 — and the smallest header that fits is chosen according to the string length (sdshdr5 packs the length into the 5 high bits of flags and has no alloc field).

struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

Expansion

sds picks an expansion strategy based on need. The usual greedy strategy grows by a multiple — doubling, until SDS_MAX_PREALLOC is reached, after which it grows by SDS_MAX_PREALLOC per step. Since not every string is modified frequently — keys, for instance, rarely change — a non-greedy path that allocates only what is requested is kept as well. (A usage sketch follows _sdsMakeRoomFor below.)

  • Expansion logic
if (greedy == 1) {
     if (newlen < SDS_MAX_PREALLOC)
         newlen *= 2;
     else
         newlen += SDS_MAX_PREALLOC;
 }
sds _sdsMakeRoomFor(sds s, size_t addlen, int greedy) {
    void *sh, *newsh;
    size_t avail = sdsavail(s);
    size_t len, newlen, reqlen;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen;
    size_t usable;

    /* Return ASAP if there is enough space left. */
    if (avail >= addlen) return s;

    len = sdslen(s);
    sh = (char*)s-sdsHdrSize(oldtype);
    reqlen = newlen = (len+addlen);
    assert(newlen > len);   /* Catch size_t overflow */
    if (greedy == 1) {
        if (newlen < SDS_MAX_PREALLOC)
            newlen *= 2;
        else
            newlen += SDS_MAX_PREALLOC;
    }

    type = sdsReqType(newlen);

    /* Don't use type 5: the user is appending to the string and type 5 is
     * not able to remember empty space, so sdsMakeRoomFor() must be called
     * at every appending operation. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

    hdrlen = sdsHdrSize(type);
    assert(hdrlen + newlen + 1 > reqlen);  /* Catch size_t overflow */
    if (oldtype==type) {
        newsh = s_realloc_usable(sh, hdrlen+newlen+1, &usable);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+hdrlen;
    } else {
        /* Since the header size changes, need to move the string forward,
         * and can't use realloc */
        newsh = s_malloc_usable(hdrlen+newlen+1, &usable);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    usable = usable-hdrlen-1;
    if (usable > sdsTypeMaxSize(type))
        usable = sdsTypeMaxSize(type);
    sdssetalloc(s, usable);
    return s;
}
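
As a quick illustration of the greedy policy, here is a minimal usage sketch. It assumes compilation inside the Redis source tree (against sds.c/sds.h); the API calls themselves (sdsnew, sdscat, sdslen, sdsavail, sdsfree) are the real ones.

#include <stdio.h>
#include "sds.h"   /* sdsnew, sdscat, sdslen, sdsavail, sdsfree */

int main(void) {
    sds s = sdsnew("hello");
    printf("len=%zu avail=%zu\n", sdslen(s), sdsavail(s));
    /* sdscat calls sdsMakeRoomFor(); with greedy growth the new buffer is
     * roughly double the required length, so avail becomes non-zero and the
     * next small append needs no further allocation. */
    s = sdscat(s, ", world");
    printf("len=%zu avail=%zu\n", sdslen(s), sdsavail(s));
    sdsfree(s);
    return 0;
}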

intset

intset is the data structure Redis uses internally to store sets of integers. It is an ordered set with no duplicate elements. Since lookups use binary search over a flat array, it suits small data sets; as the set grows, efficiency drops off.

typedef struct intset {
    uint32_t encoding;
    uint32_t length;
    int8_t contents[];
} intset;
  • encoding: the encoding of the elements; there are 3:
// 16-bit
#define INTSET_ENC_INT16 (sizeof(int16_t))
// 32-bit
#define INTSET_ENC_INT32 (sizeof(int32_t))
// 64-bit
#define INTSET_ENC_INT64 (sizeof(int64_t))
  • length: the number of elements in the array
  • contents: the elements, kept in sorted order

Expansion

intset is implemented as a flat array whose element width is fixed by encoding. If a newly added element does not fit in the current encoding's range, intset automatically upgrades every element to the wider encoding. This type-upgrade mechanism trades upgrade-time CPU for memory, keeping the smallest encoding for as long as possible. (A usage sketch follows the code below.)

/* Upgrades the intset to a larger encoding and inserts the given integer. */
static intset *intsetUpgradeAndAdd(intset *is, int64_t value) {
    uint8_t curenc = intrev32ifbe(is->encoding);
    uint8_t newenc = _intsetValueEncoding(value);
    int length = intrev32ifbe(is->length);
    // A value that forces an upgrade is out of the current encoding's range,
    // so it is either smaller than every existing element (prepend at the
    // front) or larger than all of them (append at the end).
    int prepend = value < 0 ? 1 : 0;

    /* First set new encoding and resize */
    is->encoding = intrev32ifbe(newenc);
    is = intsetResize(is,intrev32ifbe(is->length)+1);

    /* Upgrade back-to-front so we don't overwrite values.
     * Note that the "prepend" variable is used to make sure we have an empty
     * space at either the beginning or the end of the intset. */
    while(length--)
        _intsetSet(is,length+prepend,_intsetGetEncoded(is,length,curenc));

    /* Set the value at the beginning or the end. */
    if (prepend)
        _intsetSet(is,0,value);
    else
        _intsetSet(is,intrev32ifbe(is->length),value);
    is->length = intrev32ifbe(intrev32ifbe(is->length)+1);
    return is;
}
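
A minimal usage sketch that hits exactly this upgrade path (again assuming compilation inside the Redis source tree; intsetNew, intsetAdd, intsetLen and zfree are the real API):

#include <stdio.h>
#include "intset.h"   /* intsetNew, intsetAdd, intsetLen */
#include "zmalloc.h"  /* zfree */

int main(void) {
    uint8_t success;
    intset *is = intsetNew();                   /* starts as INTSET_ENC_INT16 */
    is = intsetAdd(is, 100, &success);          /* fits in int16_t */
    is = intsetAdd(is, 5000000000LL, &success); /* out of int32_t range:
                                                 * intsetUpgradeAndAdd widens
                                                 * every element to int64_t */
    printf("elements: %u\n", intsetLen(is));    /* prints 2 */
    zfree(is);
    return 0;
}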

dict

dict is the data structure Redis uses internally to store key-value pairs; underneath it is a hash table. A dict consists of three parts: the hash tables, the hash nodes (dictEntry) and the dictionary wrapper (dict).

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(dict *d, const void *key);
    void *(*valDup)(dict *d, const void *obj);
    int (*keyCompare)(dict *d, const void *key1, const void *key2);
    void (*keyDestructor)(dict *d, void *key);
    void (*valDestructor)(dict *d, void *obj);
    int (*expandAllowed)(size_t moreMem, double usedRatio);
    /* Flags */
    /* The 'no_value' flag, if set, indicates that values are not used, i.e. the
     * dict is a set. When this flag is set, it's not possible to access the
     * value of a dictEntry and it's also impossible to use dictSetKey(). Entry
     * metadata can also not be used. */
    unsigned int no_value:1;
    /* If no_value = 1 and all keys are odd (LSB=1), setting keys_are_odd = 1
     * enables one more optimization: to store a key without an allocated
     * dictEntry. */
    unsigned int keys_are_odd:1;
    /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that
     * flag is set. */

    /* Allow each dict and dictEntry to carry extra caller-defined metadata. The
     * extra memory is initialized to 0 when allocated. */
    size_t (*dictEntryMetadataBytes)(dict *d);
    size_t (*dictMetadataBytes)(void);
    /* Optional callback called after an entry has been reallocated (due to
     * active defrag). Only called if the entry has metadata. */
    void (*afterReplaceEntry)(dict *d, dictEntry *entry);
} dictType;

struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;     /* Next entry in the same hash bucket. */
    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as returned
                                 * by dictType's dictEntryMetadataBytes(). */
};

struct dict {
    dictType *type;

    dictEntry **ht_table[2];
    unsigned long ht_used[2];

    long rehashidx; /* rehashing not in progress if rehashidx == -1 */

    /* Keep small vars at end for optimal (minimal) struct padding */
    int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
    signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */

    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as defined
                                 * by dictType's dictEntryBytes. */
};

dict

  • dictType: a set of callbacks (hash function, key/value dup, compare, destructors, ...) that parameterize the hash table.
  • ht_table: the hash tables. Redis resolves hash collisions with chaining; as the table fills up, chains grow long and lookups degrade, so the table must be resized. To resize without blocking, a dict keeps two tables:
    • ht_table[0]: the active table that normally holds all the data.
    • ht_table[1]: used only while rehashing; entries are migrated incrementally from table 0 into this new (usually larger) table, which lowers the collision probability and restores lookup efficiency.
  • ht_used: the number of entries stored in each table.
  • ht_size_exp: the size exponent of each table (size = 1 << exp). Since the size is a power of two, the mask size - 1 is all ones in binary.
    • The bucket index is computed as idx = hash(key) & DICTHT_SIZE_MASK(d->ht_size_exp[table]), which guarantees idx falls in 0..N-1 (the macros are shown below).
  • rehashidx: marks whether the table is currently being rehashed,
    • rehashidx == -1: no rehash in progress
    • rehashidx >= 0: how many buckets of ht_table[0] have been rehashed so far
  • pauserehash: > 0 means rehashing is paused; < 0 indicates a coding error
  • metadata: caller-defined extra data attached to the dict
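
The size and mask macros from dict.h referenced above:

#define DICTHT_SIZE(exp) ((exp) == -1 ? 0 : (unsigned long)1<<(exp))
#define DICTHT_SIZE_MASK(exp) ((exp) == -1 ? 0 : (DICTHT_SIZE(exp))-1)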

dictEntry

  • key: pointer to the key

  • v: the value, able to hold data of different types

    • v is a union: the same memory location can be read as different types, to accommodate different kinds of values.
    • When the value is a uint64_t, int64_t or double it is embedded directly in the dictEntry, so no extra allocation is needed and memory is saved; other value types are allocated separately and only the pointer is stored here.
  • metadata: caller-defined extra data

  • next: collisions are resolved by chaining; each bucket of the hash table holds a linked list, and colliding elements are stored in the list of their bucket

    • Initialize the table: create a hash table of N buckets, each starting as an empty list.

    • Insert: for a key to insert, compute its hash value hash(key), then link the key into the list of bucket hash(key) mod N.

      /* Finds and returns the position within the dict where the provided key should
      * be inserted using dictInsertAtPosition if the key does not already exist in
      * the dict. If the key exists in the dict, NULL is returned and the optional
      * 'existing' entry pointer is populated, if provided. */
      void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) {
          unsigned long idx, table;
          dictEntry *he;
          // compute the key's hash with d->type->hashFunction
          uint64_t hash = dictHashKey(d, key);
          if (existing) *existing = NULL;
          if (dictIsRehashing(d)) _dictRehashStep(d);
      
          /* Expand the hash table if needed */
          if (_dictExpandIfNeeded(d) == DICT_ERR)
              return NULL;
          for (table = 0; table <= 1; table++) {
              // idx = hash & (N-1)
              // keep only the low bits of the hash so idx stays in 0..N-1
              idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
              /* Search if this slot does not already contain the given key */
              he = d->ht_table[table][idx];
              while(he) {
                  void *he_key = dictGetKey(he);
                  if (key == he_key || dictCompareKeys(d, key, he_key)) {
                      if (existing) *existing = he;
                      return NULL;
                  }
                  he = dictGetNext(he);
              }
              if (!dictIsRehashing(d)) break;
          }
      
          /* If we are in the process of rehashing the hash table, the bucket is
          * always returned in the context of the second (new) hash table. */
          dictEntry **bucket = &d->ht_table[dictIsRehashing(d) ? 1 : 0][idx];
          return bucket;
      }
    • Lookup: compute hash(key) and search the list at bucket hash(key) & (N-1) for the key.

    • Delete: similar; find the list containing the element, then unlink the matching node.

Expansion

When the chains in the hash buckets grow long, lookups effectively degenerate into linear list scans. At that point an expansion is triggered and the entries are incrementally rehashed into a larger table.

dict_can_resize has 3 states:

  • DICT_RESIZE_ENABLE: resizing allowed
  • DICT_RESIZE_AVOID: avoid resizing; while the server is running BGSAVE / BGREWRITEAOF it sets dict_can_resize to DICT_RESIZE_AVOID, so that a rehash does not needlessly dirty copy-on-write pages in the forked child.
  • DICT_RESIZE_FORBID: resizing forbidden
  1. If dict_can_resize == DICT_RESIZE_ENABLE and d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) >= 1, expansion is triggered.
  2. If dict_can_resize != DICT_RESIZE_FORBID and d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio, expansion is forced.
typedef enum {
    DICT_RESIZE_ENABLE,
    DICT_RESIZE_AVOID,
    DICT_RESIZE_FORBID,
} dictResizeEnable;

/* Expand the hash table if needed */
static int _dictExpandIfNeeded(dict *d)
{
    /* Incremental rehashing already in progress. Return. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* If the hash table is empty expand it to the initial size. */
    if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if ((dict_can_resize == DICT_RESIZE_ENABLE &&
         d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0])) ||
        (dict_can_resize != DICT_RESIZE_FORBID &&
         d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio))
    {
        if (!dictTypeExpandAllowed(d))
            return DICT_OK;
        // dictExpand rounds ht_used[0]+1 up to the next power of two,
        // effectively doubling the table
        return dictExpand(d, d->ht_used[0] + 1);
    }
    return DICT_OK;
}

Shrinking

Shrinking is triggered when less than 10% of the slots are in use, i.e. when dictSize(d) * 100 / dictSlots(d) < HASHTABLE_MIN_FILL (10):

#define dictSlots(d) (DICTHT_SIZE((d)->ht_size_exp[0])+DICTHT_SIZE((d)->ht_size_exp[1]))
#define dictSize(d) ((d)->ht_used[0]+(d)->ht_used[1])

int htNeedsResize(dict *dict) {
    long long size, used;

    size = dictSlots(dict);
    used = dictSize(dict);
    return (size > DICT_HT_INITIAL_SIZE &&
            (used*100/size < HASHTABLE_MIN_FILL));
}

rehash

  1. Compute the new hash table size from size via _dictNextExp (a sketch follows this list)
  • when expanding, size = the first 2^n greater than or equal to dict.ht_used[0] + 1
  • when shrinking, size = the first 2^n greater than or equal to dict.ht_used[0], with a minimum of 4
  2. Allocate the new hash table
  3. Mark dict.ht[0] as rehashing.
  • If ht[0] is large, rehashing it in one go could block the process, so Redis spreads the work out and completes it incrementally.
  • Every add/delete/update/lookup operation performs one rehash step, migrating one bucket of the old ht[0] and incrementing rehashidx.
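A sketch of what _dictNextExp computes (the real dict.c version uses __builtin_clzl instead of a loop; DICT_HT_INITIAL_EXP is 2, i.e. a 4-slot minimum; next_exp_sketch is a hypothetical name):

static signed char next_exp_sketch(unsigned long size) {
    signed char e = 2;               /* DICT_HT_INITIAL_SIZE == 4 */
    while ((1ul << e) < size) e++;   /* first power of two >= size */
    return e;
}
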
/* Expand or create the hash table,
 * when malloc_failed is non-NULL, it'll avoid panic if malloc fails (in which case it'll be set to 1).
 * Returns DICT_OK if expand was performed, and DICT_ERR if skipped. */
int _dictExpand(dict *d, unsigned long size, int* malloc_failed)
{
    if (malloc_failed) *malloc_failed = 0;

    /* the size is invalid if it is smaller than the number of
     * elements already inside the hash table */
    if (dictIsRehashing(d) || d->ht_used[0] > size)
        return DICT_ERR;

    /* the new hash table */
    dictEntry **new_ht_table;
    unsigned long new_ht_used;
    signed char new_ht_size_exp = _dictNextExp(size);

    /* Detect overflows */
    size_t newsize = 1ul<<new_ht_size_exp;
    if (newsize < size || newsize * sizeof(dictEntry*) < newsize)
        return DICT_ERR;

    /* Rehashing to the same table size is not useful. */
    if (new_ht_size_exp == d->ht_size_exp[0]) return DICT_ERR;

    /* Allocate the new hash table and initialize all pointers to NULL */
    if (malloc_failed) {
        new_ht_table = ztrycalloc(newsize*sizeof(dictEntry*));
        *malloc_failed = new_ht_table == NULL;
        if (*malloc_failed)
            return DICT_ERR;
    } else
        new_ht_table = zcalloc(newsize*sizeof(dictEntry*));

    new_ht_used = 0;

    /* Is this the first initialization? If so it's not really a rehashing
     * we just set the first hash table so that it can accept keys. */
    if (d->ht_table[0] == NULL) {
        d->ht_size_exp[0] = new_ht_size_exp;
        d->ht_used[0] = new_ht_used;
        d->ht_table[0] = new_ht_table;
        return DICT_OK;
    }

    /* Prepare a second hash table for incremental rehashing */
    d->ht_size_exp[1] = new_ht_size_exp;
    d->ht_used[1] = new_ht_used;
    d->ht_table[1] = new_ht_table;
    d->rehashidx = 0;
    return DICT_OK;
}

Rehashing recomputes the bucket of every entry in ht[0] and migrates it into ht[1].

  1. Recompute the bucket for each dictEntry
  • expanding: recompute the index using the new, larger hash mask
  • shrinking: the key's hash is unchanged, so the new index is simply the old one with the high bits masked off (e.g. going from 16 slots to 4, bucket 13 = 0b1101 maps to 13 & 0b11 = 1)
  2. Link the entry into its bucket in ht[1]
  3. Once ht[0] is empty, assign ht[1] to ht[0] and free the old ht[0]'s memory.
/* Performs N steps of incremental rehashing. Returns 1 if there are still
 * keys to move from the old to the new hash table, otherwise 0 is returned.
 *
 * Note that a rehashing step consists in moving a bucket (that may have more
 * than one key as we use chaining) from the old to the new hash table, however
 * since part of the hash table may be composed of empty spaces, it is not
 * guaranteed that this function will rehash even a single bucket, since it
 * will visit at max N*10 empty buckets in total, otherwise the amount of
 * work it does would be unbound and the function may block for a long time. */
int dictRehash(dict *d, int n) {
    int empty_visits = n*10; /* Max number of empty buckets to visit. */
    unsigned long s0 = DICTHT_SIZE(d->ht_size_exp[0]);
    unsigned long s1 = DICTHT_SIZE(d->ht_size_exp[1]);
    if (dict_can_resize == DICT_RESIZE_FORBID || !dictIsRehashing(d)) return 0;
    if (dict_can_resize == DICT_RESIZE_AVOID &&
        ((s1 > s0 && s1 / s0 < dict_force_resize_ratio) ||
         (s1 < s0 && s0 / s1 < dict_force_resize_ratio)))
    {
        return 0;
    }

    while(n-- && d->ht_used[0] != 0) {
        dictEntry *de, *nextde;

        /* Note that rehashidx can't overflow as we are sure there are more
         * elements because ht[0].used != 0 */
        assert(DICTHT_SIZE(d->ht_size_exp[0]) > (unsigned long)d->rehashidx);
        while(d->ht_table[0][d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht_table[0][d->rehashidx];
        /* Move all the keys in this bucket from the old to the new hash HT */
        while(de) {
            uint64_t h;

            nextde = dictGetNext(de);
            void *key = dictGetKey(de);
            /* Get the index in the new hash table */
            if (d->ht_size_exp[1] > d->ht_size_exp[0]) {
                h = dictHashKey(d, key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]);
            } else {
                /* We're shrinking the table. The tables sizes are powers of
                 * two, so we simply mask the bucket index in the larger table
                 * to get the bucket index in the smaller table. */
                h = d->rehashidx & DICTHT_SIZE_MASK(d->ht_size_exp[1]);
            }
            if (d->type->no_value) {
                if (d->type->keys_are_odd && !d->ht_table[1][h]) {
                    /* Destination bucket is empty and we can store the key
                     * directly without an allocated entry. Free the old entry
                     * if it's an allocated entry.
                     *
                     * TODO: Add a flag 'keys_are_even' and if set, we can use
                     * this optimization for these dicts too. We can set the LSB
                     * bit when stored as a dict entry and clear it again when
                     * we need the key back. */
                    assert(entryIsKey(key));
                    if (!entryIsKey(de)) zfree(decodeMaskedPtr(de));
                    de = key;
                } else if (entryIsKey(de)) {
                    /* We don't have an allocated entry but we need one. */
                    de = createEntryNoValue(key, d->ht_table[1][h]);
                } else {
                    /* Just move the existing entry to the destination table and
                     * update the 'next' field. */
                    assert(entryIsNoValue(de));
                    dictSetNext(de, d->ht_table[1][h]);
                }
            } else {
                dictSetNext(de, d->ht_table[1][h]);
            }
            d->ht_table[1][h] = de;
            d->ht_used[0]--;
            d->ht_used[1]++;
            de = nextde;
        }
        d->ht_table[0][d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Check if we already rehashed the whole table... */
    if (d->ht_used[0] == 0) {
        zfree(d->ht_table[0]);
        /* Copy the new ht onto the old one */
        d->ht_table[0] = d->ht_table[1];
        d->ht_used[0] = d->ht_used[1];
        d->ht_size_exp[0] = d->ht_size_exp[1];
        _dictReset(d, 1);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}

ZipList

ziplist saves memory by packing multiple elements into one contiguous region of memory, which also reduces fragmentation. It is mainly used for small lists or hashes; its design goal is a balance between memory footprint and performance. It has limits, though: elements cannot be deleted or inserted in place, because keeping the whole list contiguous means reallocating and rewriting memory; and for large data sets a ziplist is a poor fit, since obtaining a big contiguous allocation is hard, so it should not hold too much data.

ziplist memory layout

<zlbytes> <zltail> <zllen> <entry> <entry> ... <entry> <zlend>
  • zlbytes: total size of the ziplist in bytes
  • zltail: offset of the last entry
  • zllen: the number of entries
  • zlend: end marker (ZIP_END, 0xFF)

The main reasons ziplist does not use per-entry structs are to save memory and speed up access.

  1. Saving memory: a C struct is laid out contiguously, but it is not an absolutely compact block; alignment padding and any pointers or metadata add to each element's footprint. ziplist is designed to pack elements as tightly as possible into contiguous memory, so avoiding structs saves space (see the macros below).
  2. Faster access: with elements packed contiguously, access and traversal are simple pointer arithmetic; going through struct pointers would add indirection and could slow access down.
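
This is visible in how ziplist.c reaches the header fields: plain pointer arithmetic over the raw byte buffer instead of a struct (actual macros from ziplist.c):

#define ZIPLIST_BYTES(zl)       (*((uint32_t*)(zl)))
#define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t))))
#define ZIPLIST_LENGTH(zl)      (*((uint16_t*)((zl)+sizeof(uint32_t)*2)))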

A new ziplist is created following the layout above.

/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
    unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
    unsigned char *zl = zmalloc(bytes);
    ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
    ZIPLIST_LENGTH(zl) = 0;
    zl[bytes-1] = ZIP_END;
    return zl;
}

ziplist entry memory layout

Entries support traversal from both ends: the prevlen field makes stepping backwards possible.

Drawback:

Cascading updates (__ziplistCascadeUpdate): if N consecutive entries all have lengths right at the boundary, e.g. 250-253 bytes each, then after one entry grows, the next entry's prevlen field must expand from 1 byte to 5, which can push that entry over the boundary too, forcing every following entry to be resized in turn.

<prevlen> <encoding> <entry-data>
  • prevlen: the length of the previous entry (a sketch of this encoding follows the list)
    • if the previous entry is 0-253 bytes long: <prevlen from 0 to 253> <encoding> <entry>
    • if it is longer than 253 bytes: 0xFE <4 bytes unsigned little endian prevlen> <encoding> <entry>
  • encoding: the first 2 bits record the data type, the remaining bits the data length (the string encodings are listed below; encodings whose first two bits are 11 denote integers)
    • |00pppppp| - 1 byte (string with 6-bit length)
    • |01pppppp|qqqqqqqq| - 2 bytes (14-bit length, stored big endian)
    • |10000000|qqqqqqqq|rrrrrrrr|ssssssss|tttttttt| - 5 bytes (length in the 4 trailing bytes)
  • entry-data: the payload
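
A minimal sketch of writing the <prevlen> field according to the rules above (not the actual ziplist.c code, which additionally byte-swaps on big-endian hosts; prevlen_encode is a hypothetical name):

#include <stdint.h>
#include <string.h>

/* Returns the number of bytes written to p (1 or 5). */
static size_t prevlen_encode(unsigned char *p, size_t prevlen) {
    if (prevlen < 254) {                  /* 0..253: a single byte */
        p[0] = (unsigned char)prevlen;
        return 1;
    }
    p[0] = 0xFE;                          /* marker: 4-byte length follows */
    uint32_t l = (uint32_t)prevlen;
    memcpy(p + 1, &l, sizeof(l));         /* little-endian host assumed */
    return 5;
}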

QuickList

Quicklist is an optimized linked-list structure used to store list values. When a ZipList holds many entries, every insert or delete must work with one large block of contiguous memory, which is very inefficient. Quicklist addresses this by capping the length of each ZipList and chaining multiple ZipList shards together in a doubly linked list, solving the size problem.

typedef struct quicklistNode {
    struct quicklistNode *prev;
    struct quicklistNode *next;
    unsigned char *entry;
    size_t sz;             /* entry size in bytes */
    unsigned int count : 16;     /* count of items in listpack */
    unsigned int encoding : 2;   /* RAW==1 or LZF==2 */
    unsigned int container : 2;  /* PLAIN==1 or PACKED==2 */
    unsigned int recompress : 1; /* was this node previous compressed? */
    unsigned int attempted_compress : 1; /* node can't compress; too small */
    unsigned int dont_compress : 1; /* prevent compression of entry that will be used later */
    unsigned int extra : 9; /* more bits to steal for future usage */
} quicklistNode;

typedef struct quicklist {
    quicklistNode *head;
    quicklistNode *tail;
    unsigned long count;        /* total count of all entries in all listpacks */
    unsigned long len;          /* number of quicklistNodes */
    signed int fill : QL_FILL_BITS;       /* fill factor for individual nodes */
    unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
    unsigned int bookmark_count: QL_BM_BITS;
    quicklistBookmark bookmarks[];
} quicklist;
  • quicklist
    • head / tail: head and tail nodes
    • count: total number of entries across all ziplists
    • len: the number of quicklistNodes
    • fill: the fill factor capping each node: positive values limit the number of entries per node, negative values select a size class (e.g. -2 = at most 8 KB per node); see the configuration sketch below
    • compress: how many nodes at each end are left uncompressed (0 = compression off)
    • bookmarks: an optional flexible array for remembering positions in the list; bookmark_count tracks how many are in use
  • quicklistNode
    • prev / next: previous and next nodes
    • entry: pointer to this node's ziplist (a listpack in recent versions)
    • sz: the ziplist's size in bytes
    • count: the number of entries in the ziplist
    • encoding: how the node is stored, 1: RAW, 2: LZF-compressed
    • container: the data container type, 1: PLAIN, 2: PACKED
    • recompress: whether this node was temporarily decompressed and should be compressed again
      • Head and tail nodes are the most frequently accessed, so keeping them uncompressed preserves fast access, while the middle nodes of the quicklist can be compressed to save memory.
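
fill and compress come from configuration. In redis.conf the corresponding parameters are list-max-listpack-size (list-max-ziplist-size in older versions) and list-compress-depth; the values below are illustrative:

# positive: max entries per node; negative: size class (-2 = 8 KB per node)
list-max-listpack-size -2
# keep this many nodes at each end uncompressed; 0 disables compression
list-compress-depth 1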

SkipList

SkipList optimizes linked-list traversal: it keeps extra forward pointers over the list so a node's position can be located quickly. Each node carries multiple level pointers; higher levels have larger spans and skip more nodes per step. At most 32 levels are allowed. (The random level assignment is sketched after the structs below.)

typedef struct zskiplistNode {
    sds ele;
    double score;
    struct zskiplistNode *backward;
    struct zskiplistLevel {
        struct zskiplistNode *forward;
        unsigned long span;
    } level[];
} zskiplistNode;

typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;
  • zskiplist
    • header / tail: head and tail nodes
    • length: the number of nodes
    • level: the current highest level in the skiplist
  • zskiplistNode
    • ele: the value stored in the node
    • score: the node's score, used for ordering and lookups
    • backward: pointer to the previous node
    • level: the array of index levels
      • forward: pointer to the next node at this level
      • span: the number of nodes this forward pointer skips (used to compute ranks)
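
Levels are assigned randomly at insertion time: a node gets level k with probability about 0.25^(k-1), capped at 32 levels. A sketch faithful to the classic zslRandomLevel in t_zset.c:

#include <stdlib.h>

#define ZSKIPLIST_MAXLEVEL 32 /* enough for 2^64 elements */
#define ZSKIPLIST_P 0.25      /* chance of promoting one more level */

int zslRandomLevel(void) {
    int level = 1;
    while ((random() & 0xFFFF) < (ZSKIPLIST_P * 0xFFFF))
        level += 1;
    return (level < ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL;
}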

RedisObject

Every piece of Redis data is wrapped in a redisObject.

struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
                            * LFU data (least significant 8 bits frequency
                            * and most significant 16 bits access time). */
    int refcount;
    void *ptr;
};
  • type: the object type
    • 0 string
    • 1 list
    • 2 set
    • 3 zset
    • 4 hash
  • encoding: the underlying encoding
#define OBJ_ENCODING_RAW 0     /* Raw representation */
#define OBJ_ENCODING_INT 1     /* Encoded as integer */
#define OBJ_ENCODING_HT 2      /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3  /* No longer used: old hash encoding. */
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* No longer used: old list/hash/zset encoding. */
#define OBJ_ENCODING_INTSET 6  /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8  /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
#define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */
  • lru: the last access time (LRU) or LFU data, used for memory eviction (see the sketch below)
  • refcount: reference counter
  • ptr: pointer to the actual data
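
When maxmemory-policy is one of the LFU policies, the 24-bit lru field is split exactly as the struct comment says. A sketch of the unpacking done in evict.c (assuming o is a robj *):

unsigned long ldt = o->lru >> 8;      /* 16 MSB: last decrement time, minutes */
unsigned long counter = o->lru & 255; /* 8 LSB: logarithmic access counter */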