如何解决在Skylake x86-64上以可重新启动的顺序优化percpu 2级位向量
我很好奇如何才能最好地优化下面的程序集,尤其是代码块中“跳转到这里查看组件”下的部分(以便于进行control-f搜索)。
我正在编写一些代码,HOT HOT HOT路径基本上是在位向量中找到一个0位并返回该位。
位向量包含:
struct 2l_bitvec {
// outer vector with bits indicating with inner vectors have available slots
uint64_t v1;
// inner vector with actual index bits
uint64_t v2[64];
} 2l_bitvec;
每个cpu都有一个bitvec(或多个以很慢的路径数据结构链接在一起)。
要在这些位向量中管理一致性,我正在使用restartable sequences(向下滚动以获得我能找到的最佳联机帮助页)。
由于使用rseq
(这是超级热门代码),因此逻辑全部以内联汇编形式编写。
我要编写的C代码如下:
#define LIKELY(X) __builtin_expect(!!(X),1)
#define UNLIKELY(X) __builtin_expect((X),0)
uint64_t __attribute__((noinline))
restarting_l2_set_idx(uint64_t * v1,const uint32_t start_cpu) {
// if ever preempted,migrated,or catch a signal return here
catch_something_label:
if (start_cpu != __rseq_abi.cpu_id_start) {
return 4097;
}
uint64_t temp_v1 = *v1;
while (LIKELY(temp_v1 != (~(0UL)))) {
const uint32_t idx_v1 = _tzcnt_u64((~temp_v1));
uint64_t temp_v2 = v1[idx_v1 + 1];
if (LIKELY(temp_v2 != (~(0UL)))) {
const uint32_t idx = _tzcnt_u64(~temp_v2);
temp_v2 |= ((1UL) << idx);
v1[idx + 1] = temp_v2;
return 64 * idx_v1 + idx;
}
else {
temp_v1 |= ((1UL) << idx_v1);
*v1 = temp_v1;
}
}
return -1;
}
有些rseq
设置程序基本上是:
#define RSEQ_INFO_DEF(alignment) \
".pushsection __rseq_cs,\"aw\"\n\t" \
".balign " #alignment \
"\n\t" \
"3:\n\t" \
".long 0x0\n" \
".long 0x0\n" \
".quad 1f\n" \
".quad 2f - 1f\n" \
".quad 4f\n" \
".popsection\n\t"
/*
".pushsection __rseq_cs,\"aw\"\n\t" // creation section
".balign " #alignment"\n\t" // alignment at least 32
"3:\n\t" // struct info jump label
// struct is rseq_info
".long 0x0\n" // version = 0
".long 0x0\n" // flags = 0
".quad 1f\n" // start_ip = 1f (label 1,forward)
".quad 2f - 1f\n" // post_commit_offset = (start_cs
label - end_cs label)
".quad 4f\n" // abort label = 4f (label 4)
".popsection\n\t" // end section
*/
#define RSEQ_CS_ARR_DEF() \
".pushsection __rseq_cs_ptr_array,\"aw\"\n\t" \
".quad 3b\n\t" \
".popsection\n\t"
/*
".pushsection __rseq_cs_ptr_array,\"aw\"\n\t" // create ptr section
".quad 3b\n\t" // set ptr to addr of
rseq_info
".popsection\n\t" // end section
*/
#define RSEQ_PREP_CS_DEF(TEMP_REGISTER) \
"leaq 3b (%%rip)," V_TO_STR(TEMP_REGISTER) "\n\t" \
"movq " V_TO_STR(TEMP_REGISTER) ",%%fs:__rseq_abi@tpoff+8\n\t" \
/*
"leaq 3b (%%rip),REGISTER\n\t" // get set for rseq_info struct
"movq REGISTER,8(%[rseq_abi])\n\t" // store in ptr field in __rseq_abi
*/
#define RSEQ_CMP_CUR_VS_START_CPUS() \
"cmpl %[start_cpu],%%fs:__rseq_abi@tpoff+4\n\t"
/*
"cmpl %[start_cpu],4(%[rseq_abi])\n\t" // get cpu in 4(%[rseq_abi]) and
compare to %[start_cpu] which is
passed as param to function
*/
// sometimes this is better to put in the
// same code section as the critical section
#define RSEQ_START_ABORT_DEF() \
".pushsection __rseq_failure,\"ax\"\n\t" \
".byte 0x0f,0xb9,0x3d\n\t" \
".long 0x53053053\n\t" \
"4:\n\t" \
/*
".pushsection __rseq_failure,\"ax\"\n\t" // create failure section
".byte 0x0f,0x3d\n\t" // Disassembler-friendly signature:
ud1 <sig>(%rip),%edi
".long 0x53053053\n\t" // invalid operation to avoid code
injection
"4:\n\t" // abort label
*/
#define RSEQ_END_ABORT_DEF() ".popsection\n\t"
/*
".popsection\n\t" // end failure section
*/
在伪代码中,包含所有rseq
东西的程序集看起来是这样的:
/*
Type assembly will look like as follow:
foo(...,uint32_t start_cpu)
RSEQ_INFO_DEF(32)
RSEQ_CS_ARR_DEF()
RSEQ_PREP_CS_DEF()
// maybe some setup stuff (or maybe abort)
"1:\n\t"
RSEQ_CMP_CUR_VS_START_CPUS()
// handle migrated somehow
<actual critical section here>
"2:\n\t" (this is end label of critical section)
// if abort is in another code section
RSEQ_START_ABORT_DEF()
<logical for abort here>
// if this is goto generally jmp %l[abort]
// otherwise some actual logic (usually set return var)
RSEQ_END_ABORT_DEF()
: <output variables,only if NOT goto asm>
: <input variables> +
[ start_cpu ] "g"(start_cpu),// always
: <clobber registers> +
"memory","cc" // minimum clobbers
#ifdef IS_GOTO_ASM
: <jump labels OUTSIDE of the asm>
#endif
*/
针对VAST大多数中止是由于抢占而不是迁移的事实进行了优化,因此通常中止只会跳回到检查当前cpu并继续(因为比较成功)
我正在使用的汇编代码如下:
跳到这里查看大会
#define PRIMITIVE_V_TO_STR(X) #X
#define V_TO_STR(X) PRIMITIVE_V_TO_STR(X)
#define _FAILURE_MIGRATED 4097
// inlining the function often breaks stuff,so while testing I am skipping that
// aligning to cache line seems to actually affect performance significantly
uint64_t __attribute__((noinline))
__attribute__((aligned(64)))
restarting_2l_set_idx(uint64_t * const v1,const uint32_t start_cpu) {
// return [0 - 4095] -> success (that is the index)
// return [4097] -> failure the thread migrated
// return [-1] -> failure the bit vector is full
#pragma GCC diagnostic ignored "-Wuninitialized"
// pin for return so compiler doesnt fuck up
register uint64_t idx asm("rax");
// some temps I trust the compiler to allocate smartly
uint64_t * v2;
uint64_t idx_v1,temp_v1,temp_v2;
#pragma GCC diagnostic push
// clang-format off
asm volatile(
RSEQ_INFO_DEF(32)
RSEQ_CS_ARR_DEF()
// any register will do
RSEQ_PREP_CS_DEF(%[temp_v1])
"mov $" V_TO_STR(_FAILURE_MIGRATED) ",%[idx]\n\t"
#ifdef FAST_ABORT
// skip abort first time
"jmp 1f\n\t"
".byte 0x0f,0x3d\n\t" // Disassembler-friendly signature: ud1 <sig>(%rip),%edi
".long 0x53053053\n\t" // invalid operation to avoid code injection
"4:\n\t" // abort label
".byte 0x0f,0x3d\n\t"
".long 0x53053053\n\t"
"4:\n\t"
"mov $" V_TO_STR(_FAILURE_MIGRATED) ",%[idx]\n\t"
#endif
// start critical section
"1:\n\t"
// check if migrated
RSEQ_CMP_CUR_VS_START_CPUS()
// if migrated goto 2:
"jnz 2f\n\t"
// if not migrated temp_v = *v
"movq (%[v1]),%[temp_v1]\n\t"
// start loop: while(temp_v1 != -1)
"5:\n\t"
// idx = ~temp_v
"movq %[temp_v1],%[idx]\n\t"
// The reason we can't do this cmp after notq %[idx]
// (and use testq) is because
// 0 is a valid idx to return whereas -1 is not
// (also why setting idx before the comparison)
// if (%[v1]) is full leave.
// This branch is VERY unexpected.
"cmpq $-1,%[idx]\n\t"
"jz 2f\n\t"
"notq %[idx]\n\t"
// idx_v1 = tzcnt(idx) (find first one)
"tzcntq %[idx],%[idx_v1]\n\t"
// if registers are tight v2 could be in
// memory and could use [idx] as a temporary
// temp_v2 = v[idx_v1 + 1]
"leaq 8(%[v1],%[idx_v1],8),%[v2]\n\t"
"movq (%[v2]),%[temp_v2]\n\t"
// test if temp_v2 is full
"cmpq $-1,%[temp_v2]\n\t"
"jz 7f\n\t" // 7f is btsq %[idx_outer],%[temp_v1],jmp 5b
// idx = ~temp_v2
"movq %[temp_v2],%[idx]\n\t"
"notq %[idx]\n\t"
// could replace the cmpq $-1,%[temp_v2],jz above with
// testq %[idx],%[idx],jz here
// idx = tzcnt(idx)
"tzcntq %[idx],%[idx]\n\t"
// temp_v2 |= 1 << idx
"btsq %[idx],%[temp_v2]\n\t"
"jmp 9f\n\t"
"7:\n\t"
"btsq %[idx_v1],%[temp_v1]\n\t"
// this is a completely valid state to be migrated out after
// (all we have really done is cleaned up v1 vector a bit)
// because we can be migrated out here we don't check/set if
// temp_v2 is full as that could lead to invalid state in v1
"movq %[temp_v1],(%[v1])\n\t"
// this is } in while loop starting at 5:
"jmp 5b\n\t"
// prepare for commit and commit
"9:\n\t"
// temp_v2 |= 1UL << idx
"btsq %[idx],%[temp_v2]\n\t"
// prepare success return
"salq $6,%[idx_v1]\n\t"
"addq %[idx_v1],%[idx]\n\t"
// commit
"movq %[temp_v2],(%[v2])\n\t"
// end critical section
"2:\n\t"
#ifndef FAST_ABORT
RSEQ_START_ABORT_DEF()
// given that the critical section is fairly involved
// it may be worth it to put this in the same code section
// as critical section for faster aborts
"mov $" V_TO_STR(_FAILURE_MIGRATED) ",%[idx]\n\t"
"jmp 1b\n\t"
RSEQ_END_ABORT_DEF()
#endif
: [ idx] "+r" (idx)
: [ idx_v1 ] "r" (idx_v1),[ temp_v2 ] "r" (temp_v2),[ temp_v1 ] "r" (temp_v1),[ v2 ] "r" (v2),[ v1 ] "g" (v1),[ start_cpu] "g" (start_cpu)
: "memory","cc");
return idx;
}
正确之后,我的第一个,第二个和第三个目标就是要快速。所有优化都必须考虑到在执行任何指令后代码都可以跳转到异常终止状态(因此,为什么从temp_v2
到v2
的提交是关键部分的最后一条指令)。而且如果中止是由于线程迁移导致的,则该函数无法写入任何数据(否则将出现严重的竞争状况)。
如果要在用户空间中运行/编译此文件,则需要包含linux/rseq.h
标头。设置的一个不错的“ hello world”是here和/或位于librseq。
注意:我将其发布在这里,而不是在codereview.SE上发布,因为我的主要问题是如何更快地完成关键部分restarting_l2_set_idx
中的程序集。
编辑: @PeterCordes
建议在此处替换leaq:
"leaq 8(%[v1],%[temp_v2]\n\t"
我将其更改为此
"movq %[v1],%[v2]\n\t" // v2 = v1
"salq $3,%[idx_v1]\n\t" // idx_v1 = 8 * idx_v1
"addq %[idx_v1],%[v2]\n\t" // v2 += idx_v1 (index by uint64_t)
"movq 8(%[v2]),%[temp_v2]\n\t" // temp_v2 = *(v + 8)
由于idx_v1
现在具有8 x的位位置,因此它代表以下代码:
// in 7: label
"btsq %[idx_v1],%[temp_v1]\n\t"
到
"sarq $3,%[idx_v1]\n\t"
"btsq %[idx_v1],%[temp_v1]\n\t"
和
// in 9: label
"salq $6,%[idx_v1]\n\t"
到
"salq $3,%[idx_v1]\n\t"
但是,我不确定这是否实际上是一种性能改进。我认为确实需要为提交存储v2
来压抑这一事实。
Edit2:
@PeterCordes指出我的编辑很愚蠢:我可以删除v2
临时文件
和使用
movq 8(%[v1],%[temp_v2]
得到temp_v2
并
movq %[temp_v2],8(%[v1],8)
进行存储。对不起,我天真的第一次编辑:(
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。