当前位置:网站首页>Redis源码分析之HSET流程与ziplist

Redis源码分析之HSET流程与ziplist

2022-04-23 14:11:00 Mrpre

本篇分析Redis 如何处理hset指令

数据结构

首先,hset不同于setset是存储在dict,即redis字典里面,毕竟set只是简单的kv类型数据。 hset存储的是复杂的数据结构,使用了压缩表ziplist(超大数据使用hashmap,本篇不涉及),例如

hset mykey name abc id 123

其中 mykey 为 key,abc为field,123为 value 。首先,需要说明的是,如果想了解ziplist内存的布局,完全可以看ziplist.c 文件开头的那些注释,这个是非常官方的解释,也非常详细。

  • ziplist 其实挺坑的,其查找时间复杂度是O(n),如果是hmget n个field,那么复杂度就是O(n^2)
  • hash_max_ziplist_entries 可以控制超过多少个field或者value后,转换成hashtable。hash_max_ziplist_value 可以控制 field或者value超过多大后,转换成hashtable
  • 但是,设置 通过congfig set 来设置hash_max_ziplist_entries/hash_max_ziplist_value 不能立刻生效!!!!,除非 设置这些配置过后,key被hset了一下,那么才尝试去转换成hashtable,或者 通过redis.conf 里面配死,然后 bgsave/save生成rdb文件后,重新启动解析rdb时,会将之前的ziplist转换为hashtable。如果你的业务在设置过后一直是hget,那么一直都在O(n)
  • 对于少量field后者value,使用 ziplist是有意义的;对于巨长field或者value,ziplist也节省不了空间。
  • 不要尝试改动这2个值,如果你对自己存储业的数据没有百分之百了解的情况下,改动这2个参数,意味着潜在的雪崩。

使用场景:存储序列化的kv。比如,一个人叫张三,他属性有年龄、性别、职业等,使用者往往使用struct/class 存储这些属性:

struct  people {
int 年龄
string 性别
} 张三

那么有几种方法存储这些信息到kv里面
(1)自己序列化,例如key为 张三,value为年龄+性别等信息,然后set到redis;取出来时自己反序列化。
(2)不同属性设置不同key,例如(张三_年龄,18),(张三_性别,男)。。。这么一堆key,分别set,然后使用时分别get。
(3)使用hset,这就体现hset的优势了。

名为mykey的key,里面有2个条目,分别是 name 以及 id。如果让你设计如何存储这些数据,怎么设计?最简单的就是将key和filed合并,即 {mykey_name, abc}, {mykey_id, 123} 2个kv,然后分别set。当然redis并不是这么存储的,我们先来看下redis怎么存储的,最后再来对比和set指令的区别。

hset对应的server的执行函数是hsetCommand

void hsetCommand(client *c) {
    
    int i, created = 0;
    robj *o;

	/*hset 指令 都是 field value 对,故正确指令的参数是2的倍数*/
    if ((c->argc % 2) == 1) {
    
        addReplyError(c,"wrong number of arguments for HMSET");
        return;
    }

	/*找到 key 对应 的object,没找到就 创建 * 假设命令是 hset mykey name abc id 123,则 查找 mykey * */
    if ((o = hashTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return;
	/*后面再说*/
    hashTypeTryConversion(o,c->argv,2,c->argc-1);

	/*保存 field value*/
    for (i = 2; i < c->argc; i += 2)
        created += !hashTypeSet(o,c->argv[i]->ptr,c->argv[i+1]->ptr,HASH_SET_COPY);

    /* HMSET (deprecated) and HSET return value is different. */
    char *cmdname = c->argv[0]->ptr;
    if (cmdname[1] == 's' || cmdname[1] == 'S') {
    
        /* HSET */
        addReplyLongLong(c, created);
    } else {
    
        /* HMSET */
        addReply(c, shared.ok);
    }
    signalModifiedKey(c->db,c->argv[1]);
    notifyKeyspaceEvent(NOTIFY_HASH,"hset",c->argv[1],c->db->id);
    server.dirty++;
}

所以保存 field value的核心函数是 hashTypeSet。在看 hashTypeSet先看看hashTypeLookupWriteOrCreate函数,这个毕竟,知道如何创建ziplist就大致了解如何存储数据到ziplist

hashTypeLookupWriteOrCreate调用了createHashObject创建obj,然后调用ziplistNew创建ziplist挂在obj上。

/* Create a new empty ziplist. */
unsigned char *ziplistNew(void) {
    
    unsigned int bytes = ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE;
    unsigned char *zl = zmalloc(bytes);
    ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
    ZIPLIST_LENGTH(zl) = 0;
    zl[bytes-1] = ZIP_END;
    return zl;
}

ziplistNew顾名思义,创建空的ziplist,空的ziplist,初始大小是ZIPLIST_HEADER_SIZE+ZIPLIST_END_SIZE

/* The size of a ziplist header: two 32 bit integers for the total * bytes count and last item offset. One 16 bit integer for the number * of items field. */
#define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t))

/* Size of the "end of ziplist" entry. Just one byte. */
#define ZIPLIST_END_SIZE (sizeof(uint8_t))

显然,初始化的字段,都是一些头部字段以及边界字段

|   total bytes(32bit)  |  last item offset(32bit)   |  number of items (16bit)  |  end (8bit)|

ZIPLIST_BYTES宏是用来取当前ziplist总长度的,包括头部长度以及entry的总长度,也就是 第一个32bit的 total bytes。
ZIPLIST_TAIL_OFFSET是用来取最后一个entry位置的,也就是last item offset所保存的值。

但是这是空的ziplist,如果保存field value那么数据结构又是怎么样的,这里先展示entry在整个ziplist中的布局:

|   total bytes  |  last item offset   |  number of items |  entry1 | entry2 | end |

entry 放置在 number of items之后,end之前。

下面我们看看如何存储数据到 ziplist,然后怎么创建entry的。

#define HASH_SET_TAKE_FIELD (1<<0)
#define HASH_SET_TAKE_VALUE (1<<1)
#define HASH_SET_COPY 0
int hashTypeSet(robj *o, sds field, sds value, int flags) {
    
    int update = 0;

	/*默认就是 OBJ_ENCODING_ZIPLIST ,所以进这个if*/
    if (o->encoding == OBJ_ENCODING_ZIPLIST) {
    
        unsigned char *zl, *fptr, *vptr;

		/*zl 是在 hashTypeLookupWriteOrCreate时,通过 ziplistNew 创建*/
        zl = o->ptr;
		/*ziplistIndex 获取 第ZIPLIST_HEAD个entry *显然如果之前没有hset过,那么 fptr返回是null */
        fptr = ziplistIndex(zl, ZIPLIST_HEAD);
        if (fptr != NULL) {
    
        	/*从fptr所指的entry开始找,找到 field一样的entry *由于 fptr是 第 ZIPLIST_HEAD 个节点,即第0个节点 *这就是相当于从第0个entry开始找,时间复杂度是O(n) */
            fptr = ziplistFind(fptr, (unsigned char*)field, sdslen(field), 1);
            if (fptr != NULL) {
    
            	/*表示找到了,那么就覆盖 value*/
                /* Grab pointer to the value (fptr points to the field) */
                vptr = ziplistNext(zl, fptr);
                serverAssert(vptr != NULL);
                update = 1;

                /* Delete value */
                zl = ziplistDelete(zl, &vptr);

                /* Insert new value */
                zl = ziplistInsert(zl, vptr, (unsigned char*)value,
                        sdslen(value));
            }
        }

        if (!update) {
    
            /* Push new field/value pair onto the tail of the ziplist */
            /*没找到field对应的entry,那么就插入一个 *但是我们看到 要插入2次,field和value分别作为一个entry插入 * */
            zl = ziplistPush(zl, (unsigned char*)field, sdslen(field),
                    ZIPLIST_TAIL);
            zl = ziplistPush(zl, (unsigned char*)value, sdslen(value),
                    ZIPLIST_TAIL);
        }
        o->ptr = zl;

        /* Check if the ziplist needs to be converted to a hash table */
        if (hashTypeLength(o) > server.hash_max_ziplist_entries)
            hashTypeConvert(o, OBJ_ENCODING_HT);
    } else if (o->encoding == OBJ_ENCODING_HT) {
    
		......
    } else {
    
		......
    }

    /* Free SDS strings we did not referenced elsewhere if the flags * want this function to be responsible. */
    if (flags & HASH_SET_TAKE_FIELD && field) sdsfree(field);
    if (flags & HASH_SET_TAKE_VALUE && value) sdsfree(value);
    return update;
}

梳理下流程:
(1)找到第一个entry,然后从第一个entry开始找和filed相同的entry
(2)若(1)没找到,则创建2个entry,若(1)找到了则更新value

这里就要讲下 entry 的内存分布了

| pre_len | encoding | content |  

pre_len:记录了前一个entry的长度,因为 entry 是线性布局的,所以当我们知道前一个entry的长度时,我们就能在当前entry起始地址减去这个长度,就能寻址到前一个entry,达到反向遍历的效果。 pre_len本身的存储是被编码的,而不是简单的x字节。
pre_len的表示长度,要么1字节要么5字节,既简单也暴力。pre_len存储的值小于254,就用1字节表示,大于等于254就用5字节。当pre_len的msb为0x00时,当前字节就表示了长度,如果是0xfe时,那么其后4个字节,就是表示了长度。

encode:记录了content的类型和长度,他本身也是被编码的,而不是简单的x字节。 encode的编码就复杂了点。这里就直接贴出 源码中的注释,核心就是不同的msb (2bit)表示不同长度的数据。最高如果是11,那么他更特殊,因为后面保存的是数字。

* |00pppppp| - 1 byte
 *      String value with length less than or equal to 63 bytes (6 bits).
 *      "pppppp" represents the unsigned 6 bit length.
 * |01pppppp|qqqqqqqq| - 2 bytes
 *      String value with length less than or equal to 16383 bytes (14 bits).
 *      IMPORTANT: The 14 bit number is stored in big endian.
 * |10000000|qqqqqqqq|rrrrrrrr|ssssssss|tttttttt| - 5 bytes
 *      String value with length greater than or equal to 16384 bytes.
 *      Only the 4 bytes following the first byte represents the length
 *      up to 32^2-1. The 6 lower bits of the first byte are not used and
 *      are set to zero.
 *      IMPORTANT: The 32 bit number is stored in big endian.
 * |11000000| - 3 bytes
 *      Integer encoded as int16_t (2 bytes).
 * |11010000| - 5 bytes
 *      Integer encoded as int32_t (4 bytes).
 * |11100000| - 9 bytes
 *      Integer encoded as int64_t (8 bytes).
 * |11110000| - 4 bytes
 *      Integer encoded as 24 bit signed (3 bytes).
 * |11111110| - 2 bytes
 *      Integer encoded as 8 bit signed (1 byte).
 * |1111xxxx| - (with xxxx between 0000 and 1101) immediate 4 bit integer.
 *      Unsigned integer from 0 to 12. The encoded value is actually from
 *      1 to 13 because 0000 and 1111 can not be used, so 1 should be
 *      subtracted from the encoded 4 bit value to obtain the right value.
 * |11111111| - End of ziplist special entry.

content:就是实际的内容了。

此时不得不重提ziplist内存布局了,entry位于number of items之后,end
之前,ziplist作为一个线性空间,插入filed会必然导致频繁的realloc,即当频繁对一个key增加成千上百个field value时,就存在大量realloc以及内存拷贝

|   total bytes(32bit)  |  last item offset(32bit)   |  number of items (16bit)  |  end (8bit)|

我们来看看一个entry如何表示field的,插入函数是ziplistPush
ziplistPush->__ziplistInsert,下面代码中的注释就是我们关心的细节。

/* Insert item at "p". */
unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
    
    size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen;
    unsigned int prevlensize, prevlen = 0;
    size_t offset;
    int nextdiff = 0;
    unsigned char encoding = 0;
    long long value = 123456789; /* initialized to avoid warning. Using a value that is easy to see if for some reason we use it uninitialized. */
    zlentry tail;

	/*入参p是ZIPLIST_ENTRY_END(zl),即最后一个entry的结尾,所以p[0]在当前这种情况,必然是 ZIP_END *只有当 p 是 ZIPLIST_ENTRY_HEAD时p[0]才不会为ZIP_END */
    /* Find out prevlen for the entry that is inserted. */
    if (p[0] != ZIP_END) {
    
        ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
    } else {
    
    	/*到这里,p[0] == ZIP_END, 说明p指向末尾了,那么只能取最后一个entry*/
        unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
        if (ptail[0] != ZIP_END) {
    
        	/*到这里,说明当前有entry,那么就取最后一个entry*/
            prevlen = zipRawEntryLength(ptail);
        }/*else 说明这个zl 压根就是空的*/
        
    }

    /* See if the entry can be encoded */
    /*编码,看看字符串的值是不是数字,是数字就存储为数字,这样省内存 *编码成功,reqlen 就能设置为 具体的长度了,否则设置为原始长度slen */
    if (zipTryEncoding(s,slen,&value,&encoding)) {
    
        /* 'encoding' is set to the appropriate integer encoding */
        reqlen = zipIntSize(encoding);
    } else {
    
        /* 'encoding' is untouched, however zipStoreEntryEncoding will use the * string length to figure out how to encode it. */
        reqlen = slen;
    }
    /* We need space for both the length of the previous entry and * the length of the payload. */
    reqlen += zipStorePrevEntryLength(NULL,prevlen);
    reqlen += zipStoreEntryEncoding(NULL,encoding,slen);

	/*到这里,reqlen就是实际entry的长度,具体 编码 看上面2个函数*/
	
    /* When the insert position is not equal to the tail, we need to * make sure that the next entry can hold this entry's length in * its prevlen field. */
    int forcelarge = 0;
    /*上面注释写的很清楚了,由于下一个entry存折当前entry的长度,而长度本身又是编码的不定长字节,如果当前entry改变了长度,导致下一个entry保存pre_len字节数有变化,就是nextdiff*/
    nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;
    if (nextdiff == -4 && reqlen < 4) {
    
        nextdiff = 0;
        forcelarge = 1;
    }

    /* Store offset because a realloc may change the address of zl. */
    offset = p-zl;
    zl = ziplistResize(zl,curlen+reqlen+nextdiff);
    p = zl+offset;

    /* Apply memory move when necessary and update tail offset. */
    /*上面对ziplist realloc,即ziplist后面多了一堆空内存,这里如果p不是插入在最后而是插入在中间,那么需要把p对应的地方的数据,往后面挪,然后空出空间来保存当前的entry*/
    if (p[0] != ZIP_END) {
    
        /* Subtract one because of the ZIP_END bytes */
        memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);

        /* Encode this entry's raw length in the next entry. */
        if (forcelarge)
            zipStorePrevEntryLengthLarge(p+reqlen,reqlen);
        else
            zipStorePrevEntryLength(p+reqlen,reqlen);

        /* Update offset for tail */
        ZIPLIST_TAIL_OFFSET(zl) =
            intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);

        /* When the tail contains more than one entry, we need to take * "nextdiff" in account as well. Otherwise, a change in the * size of prevlen doesn't have an effect on the *tail* offset. */
        zipEntry(p+reqlen, &tail);
        if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
    
            ZIPLIST_TAIL_OFFSET(zl) =
                intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
        }
    } else {
    
        /* This element will be the new tail. */
        ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
    }

    /* When nextdiff != 0, the raw length of the next entry has changed, so * we need to cascade the update throughout the ziplist */
    if (nextdiff != 0) {
    
    	/*这就是所谓的连锁更新,碰到这种情况就糟糕的很*/
        offset = p-zl;
        zl = __ziplistCascadeUpdate(zl,p+reqlen);
        p = zl+offset;
    }

    /* Write the entry */
    p += zipStorePrevEntryLength(p,prevlen);
    p += zipStoreEntryEncoding(p,encoding,slen);
    if (ZIP_IS_STR(encoding)) {
    
        memcpy(p,s,slen);
    } else {
    
        zipSaveInteger(p,value,encoding);
    }
    ZIPLIST_INCR_LENGTH(zl,1);
    return zl;
}

版权声明
本文为[Mrpre]所创,转载请带上原文链接,感谢
https://wonderful.blog.csdn.net/article/details/106824764