在Skylake x86-64上使用可重启序列(restartable sequences)优化每CPU的两级位向量

如何在Skylake x86-64上使用可重启序列(restartable sequences)优化每CPU的两级位向量

我很想知道怎样才能最好地优化下面的汇编代码,尤其是下文标有“跳到这里查看大会”的那个代码块(即汇编实现;加上这个标记是为了便于用control-f搜索)。


我正在编写一些代码,HOT HOT HOT路径基本上是在位向量中找到一个0位并返回该位。

位向量包含:

// Two-level bit vector: one 64-bit outer word plus 64 inner 64-bit words,
// i.e. 64 * 64 = 4096 addressable index bits.
// NOTE(review): `2l_bitvec` begins with a digit and is therefore not a valid
// C identifier; treat this declaration as illustrative until it is renamed.
struct 2l_bitvec {
       // outer vector with bits indicating which inner vectors have available slots
       // NOTE(review): restarting_l2_set_idx SETS a bit here once the matching
       // inner word is all ones, so a set bit means "no slots available".
       uint64_t v1;

       // inner vector with actual index bits
       uint64_t v2[64];
} 2l_bitvec;

每个CPU都有一个bitvec(或者有多个,由一个处于极慢路径上的数据结构链接在一起)。

要在这些位向量中管理一致性,我正在使用restartable sequences(向下滚动以获得我能找到的最佳联机帮助页)。

由于使用了rseq(而且这是极热的代码路径),逻辑全部以内联汇编的形式编写。

我要编写的C代码如下:

// Branch-prediction hints for GCC/Clang.
// Both macros normalise X to 0 or 1 with `!!` before handing it to
// __builtin_expect, so any scalar condition (including pointers and wide
// integers) is accepted and the macro always evaluates to a boolean.
#define LIKELY(X)   __builtin_expect(!!(X),1)
#define UNLIKELY(X) __builtin_expect(!!(X),0)
/*
 * Reference C rendering of the allocation fast path: find the lowest clear
 * bit in a two-level bit vector, set it, and return its index.
 *
 * v1[0] is the outer word; v1[1 + i] is inner word i. A set bit i in the
 * outer word means inner word i has already been observed to be full.
 *
 * v1        - pointer to the outer word (inner words follow it in memory)
 * start_cpu - cpu the caller believes it is running on
 *
 * Returns 64 * outer_index + inner_index on success, 4097 if the current cpu
 * (as read from __rseq_abi.cpu_id_start) differs from start_cpu, or
 * (uint64_t)-1 when the outer word is all ones.
 */
uint64_t __attribute__((noinline))
restarting_l2_set_idx(uint64_t * v1,const uint32_t start_cpu) {
    
// if ever preempted,migrated,or catch a signal return here
catch_something_label:
    
    if (start_cpu != __rseq_abi.cpu_id_start) {
        return 4097;
    }

    uint64_t temp_v1 = *v1;
    while (LIKELY(temp_v1 != (~(0UL)))) {
        // lowest clear bit of the outer word selects the inner word to try
        const uint32_t idx_v1  = _tzcnt_u64((~temp_v1));
        
        uint64_t       temp_v2 = v1[idx_v1 + 1];
        if (LIKELY(temp_v2 != (~(0UL)))) {
            const uint32_t idx = _tzcnt_u64(~temp_v2);
            
            temp_v2 |= ((1UL) << idx);
            // commit to the SAME inner word that was loaded above; it is
            // selected by the outer index, not by the bit position inside it
            v1[idx_v1 + 1] = temp_v2;
            
            return 64 * idx_v1 + idx;
        }
        else {
            // inner word turned out to be full: record that in the outer
            // word and retry with the next candidate
            temp_v1 |= ((1UL) << idx_v1);
            *v1 = temp_v1;
        }
    }
    
    return -1;
}

有些rseq设置程序基本上是:

// Assembler text that lays down one critical-section descriptor in section
// __rseq_cs, aligned to `alignment` bytes and addressable as local label 3.
// The record is two 32-bit zeros followed by three 64-bit values that refer
// to local labels the enclosing asm statement must define afterwards:
//   1f       start of the critical section
//   2f - 1f  length of the critical section (end label minus start label)
//   4f       abort target
// The file's own annotation names these fields version, flags, start_ip,
// post_commit_offset and abort.
#define RSEQ_INFO_DEF(alignment)                                               \
    ".pushsection __rseq_cs,\"aw\"\n\t"                                       \
    ".balign " #alignment                                                      \
    "\n\t"                                                                     \
    "3:\n\t"                                                                   \
    ".long 0x0\n"                                                              \
    ".long 0x0\n"                                                              \
    ".quad 1f\n"                                                               \
    ".quad 2f - 1f\n"                                                          \
    ".quad 4f\n"                                                               \
    ".popsection\n\t"

/*
    ".pushsection __rseq_cs,\"aw\"\n\t"    // creation section
    ".balign " #alignment"\n\t"             // alignment at least 32
    "3:\n\t"                                // struct info jump label
                                            // struct is rseq_info
    ".long 0x0\n"                           // version = 0
    ".long 0x0\n"                           // flags = 0
    ".quad 1f\n"                            // start_ip = 1f (label 1,forward)
    ".quad 2f - 1f\n"                       // post_commit_offset = (end_cs
                                               label - start_cs label)
    ".quad 4f\n"                            // abort label = 4f (label 4)
    ".popsection\n\t"                       // end section
*/


// Assembler text that appends one 8-byte pointer to section
// __rseq_cs_ptr_array. The pointer is `3b`, i.e. the most recently defined
// local label 3, so this must be expanded after the descriptor that defines
// label 3 within the same asm statement.
#define RSEQ_CS_ARR_DEF()                                                      \
    ".pushsection __rseq_cs_ptr_array,\"aw\"\n\t"                             \
    ".quad 3b\n\t"                                                             \
    ".popsection\n\t"

/*
    ".pushsection __rseq_cs_ptr_array,\"aw\"\n\t"  // create ptr section
    ".quad 3b\n\t"                                  // set ptr to addr of
                                                       rseq_info
    ".popsection\n\t"                               // end section
*/

// Assembler text that arms the critical section: loads the address of the
// most recently defined local label 3 (the descriptor) into TEMP_REGISTER and
// stores it at offset 8 of the thread-local __rseq_abi, which the file's
// annotation describes as the descriptor-pointer field.
// TEMP_REGISTER is pasted verbatim (e.g. %[temp_v1]) and is clobbered.
// The last line deliberately has no trailing backslash, so whatever follows
// the macro definition can never be spliced into it.
#define RSEQ_PREP_CS_DEF(TEMP_REGISTER)                               \
    "leaq 3b (%%rip)," V_TO_STR(TEMP_REGISTER) "\n\t"                         \
    "movq " V_TO_STR(TEMP_REGISTER) ",%%fs:__rseq_abi@tpoff+8\n\t"



/*
    "leaq 3b (%%rip),REGISTER\n\t"     // get set for rseq_info struct
    "movq REGISTER,8(%[rseq_abi])\n\t" // store in ptr field in __rseq_abi
*/

// Assembler text that compares the %[start_cpu] operand with the 32-bit value
// at offset 4 of the thread-local __rseq_abi; only the flags are modified
// (ZF set when the two are equal).
// NOTE(review): the C reference version compares against
// __rseq_abi.cpu_id_start, whereas offset 4 looks like it addresses the field
// after it — confirm against the struct rseq layout which one is intended.
#define RSEQ_CMP_CUR_VS_START_CPUS()                                           \
    "cmpl %[start_cpu],%%fs:__rseq_abi@tpoff+4\n\t"

/*
    "cmpl %[start_cpu],4(%[rseq_abi])\n\t" // get cpu in 4(%[rseq_abi]) and
                                               compare to %[start_cpu] which is
                                               passed as param to function
*/


// sometimes this is better to put in the
// same code section as the critical section
// Assembler text that opens the abort handler in executable section
// __rseq_failure: three opcode bytes (annotated in this file as the
// disassembler-friendly `ud1 <sig>(%rip),%edi`), the 32-bit signature
// 0x53053053, and then local label 4, the abort target. The handler body
// follows the expansion and must be closed with RSEQ_END_ABORT_DEF().
// The last line deliberately has no trailing backslash, so whatever follows
// the macro definition can never be spliced into it.
#define RSEQ_START_ABORT_DEF()                                                 \
    ".pushsection __rseq_failure,\"ax\"\n\t"                                  \
    ".byte 0x0f,0xb9,0x3d\n\t"                                               \
    ".long 0x53053053\n\t"                                                     \
    "4:\n\t"

/*
  ".pushsection __rseq_failure,\"ax\"\n\t" // create failure section
    ".byte 0x0f,0xb9,0x3d\n\t"       // Disassembler-friendly signature:
                                               ud1 <sig>(%rip),%edi
    ".long 0x53053053\n\t"                  // invalid operation to avoid code
                                               injection 
    "4:\n\t"                                // abort label
*/

// Assembler text that closes the section opened by RSEQ_START_ABORT_DEF(),
// switching the assembler back to the previously active section.
#define RSEQ_END_ABORT_DEF() ".popsection\n\t"

/*
    ".popsection\n\t"   // end failure section
*/

在伪代码中,包含所有rseq东西的程序集看起来是这样的:

/*
Typical assembly will look as follows:
foo(...,uint32_t start_cpu) 
    RSEQ_INFO_DEF(32) 
    RSEQ_CS_ARR_DEF() 
    RSEQ_PREP_CS_DEF()

    // maybe some setup stuff (or maybe abort)

    "1:\n\t"    

    RSEQ_CMP_CUR_VS_START_CPUS()
    // handle migrated somehow

    <actual critical section here>
    "2:\n\t" (this is end label of critical section)

    // if abort is in another code section
    RSEQ_START_ABORT_DEF()
    <logical for abort here>
        // if this is goto generally jmp %l[abort]
        // otherwise some actual logic (usually set return var)
    RSEQ_END_ABORT_DEF()
    : <output variables,only if NOT goto asm>
    : <input variables> +
     [ start_cpu ] "g"(start_cpu),// always
    : <clobber registers> +
      "memory","cc" // minimum clobbers
    #ifdef IS_GOTO_ASM
    : <jump labels OUTSIDE of the asm>
    #endif
*/

这段代码针对“绝大多数中止是由抢占而非迁移引起的”这一事实做了优化:通常中止后只需跳回到检查当前CPU的位置,然后继续执行(因为比较会成功)。

我正在使用的汇编代码如下:

跳到这里查看大会

// Two-step stringification: the inner macro turns its argument into a string
// literal as written; the outer one lets the argument be macro-expanded first,
// so V_TO_STR(_FAILURE_MIGRATED) yields "4097" rather than the macro's name.
#define PRIMITIVE_V_TO_STR(token) #token
#define V_TO_STR(expr) PRIMITIVE_V_TO_STR(expr)

// Returned when the thread is found to be running on a different cpu.
#define _FAILURE_MIGRATED 4097

// inlining the function often breaks stuff,so while testing I am skipping that
// aligning to cache line seems to actually affect performance significantly

/*
 * Claim the lowest clear bit of a two-level bit vector from inside an rseq
 * critical section.
 *
 * v1[0] is the outer word and v1[1 + i] is inner word i. A set bit i in the
 * outer word records that inner word i was observed to be full.
 *
 * v1        - pointer to the outer word (inner words follow it in memory)
 * start_cpu - cpu the caller believes it is running on
 *
 * Returns:
 *   [0, 4095]          index of the bit that was set (64 * outer + inner)
 *   _FAILURE_MIGRATED  the cpu comparison failed; no index was claimed
 *   (uint64_t)-1       the outer word is all ones (vector full)
 *
 * The only store that claims an index is the last instruction of the critical
 * section. The earlier store to the outer word merely records that an inner
 * word is full, which is a valid state to be interrupted in.
 */
uint64_t __attribute__((noinline))
__attribute__((aligned(64)))
    restarting_2l_set_idx(uint64_t * const v1,const uint32_t start_cpu) {
    // return [0 - 4095] -> success (that is the index)
    // return [4097] -> failure the thread migrated
    // return [-1] -> failure the bit vector is full

    // pinned to rax so the result is produced directly in the return register
    register uint64_t idx asm("rax");

    // Scratch registers. The asm WRITES all of them, so they are declared as
    // early-clobber outputs below (writing to an input operand is not allowed).
    // Because nothing is read uninitialised any more, no -Wuninitialized
    // suppression pragma is needed.
    uint64_t * v2;
    uint64_t idx_v1,temp_v1,temp_v2;

    // clang-format off
    asm volatile(
        RSEQ_INFO_DEF(32)
        RSEQ_CS_ARR_DEF()

#ifdef FAST_ABORT
        // Inline abort target, placed BEFORE the arming code so that an abort
        // falls straight through into re-arming. First entry jumps over the
        // signature bytes, which must never be executed.
        "jmp 6f\n\t"
        ".byte 0x0f,0xb9,0x3d\n\t"             // Disassembler-friendly signature: ud1 <sig>(%rip),%edi
        ".long 0x53053053\n\t"                  // signature word checked in front of the abort label
        "4:\n\t"                                // abort label (outside [1, 2))
#endif

        // (Re-)arm point. The kernel clears the descriptor pointer when it
        // aborts a critical section, so every retry has to store it again;
        // jumping straight back to 1: would run the sequence unprotected.
        "6:\n\t"

        // any register will do
        RSEQ_PREP_CS_DEF(%[temp_v1])

        // default result, returned if the cpu check below fails
        "mov $" V_TO_STR(_FAILURE_MIGRATED) ",%[idx]\n\t"

        // start critical section
        "1:\n\t"

        // check if migrated
        RSEQ_CMP_CUR_VS_START_CPUS()
        // if migrated goto 2:
        "jnz 2f\n\t"

        // if not migrated temp_v1 = *v1
        "movq (%[v1]),%[temp_v1]\n\t"

        // start loop: while(temp_v1 != -1)
        "5:\n\t"

        // idx = temp_v1
        "movq %[temp_v1],%[idx]\n\t"

        // The reason we can't do this cmp after notq %[idx]
        // (and use testq) is because
        // 0 is a valid idx to return whereas -1 is not
        // (also why setting idx before the comparison)

        // if (%[v1]) is full leave with idx == -1.
        // This branch is VERY unexpected.
        "cmpq $-1,%[idx]\n\t"
        "jz 2f\n\t"

        // idx = ~temp_v1
        "notq %[idx]\n\t"

        // idx_v1 = tzcnt(idx) (find first one)
        "tzcntq %[idx],%[idx_v1]\n\t"

        // v2 = &v1[idx_v1 + 1]; temp_v2 = *v2
        "leaq 8(%[v1],%[idx_v1],8),%[v2]\n\t"
        "movq (%[v2]),%[temp_v2]\n\t"

        // test if temp_v2 is full
        "cmpq $-1,%[temp_v2]\n\t"
        "jz 7f\n\t" // 7f is btsq %[idx_v1],%[temp_v1],jmp 5b

        // idx = ~temp_v2
        "movq %[temp_v2],%[idx]\n\t"
        "notq %[idx]\n\t"

        // idx = tzcnt(idx)
        "tzcntq %[idx],%[idx]\n\t"

        // the bit is set once, at 9: (it used to be set here as well)
        "jmp 9f\n\t"

        "7:\n\t"
        "btsq %[idx_v1],%[temp_v1]\n\t"

        // this is a completely valid state to be migrated out after
        // (all we have really done is cleaned up v1 vector a bit)
        // because we can be migrated out here we don't check/set if
        // temp_v2 is full as that could lead to invalid state in v1
        "movq %[temp_v1],(%[v1])\n\t"

        // this is } in while loop starting at 5:
        "jmp 5b\n\t"

        // prepare for commit and commit
        "9:\n\t"

        // temp_v2 |= 1UL << idx
        "btsq %[idx],%[temp_v2]\n\t"

        // prepare success return: idx = 64 * idx_v1 + idx
        "salq $6,%[idx_v1]\n\t"
        "addq %[idx_v1],%[idx]\n\t"

        // commit (must stay the last instruction of the critical section)
        "movq %[temp_v2],(%[v2])\n\t"

        // end critical section
        "2:\n\t"

#ifndef FAST_ABORT
        RSEQ_START_ABORT_DEF()
        // Out-of-line abort handler: go back to the arming point so that the
        // retry is protected again; idx is reset there as well.
        "jmp 6b\n\t"
        RSEQ_END_ABORT_DEF()
#endif

        // Every register the asm writes is an early-clobber output, so none
        // of them can share a register with v1 or start_cpu, which are still
        // read after the first write.
        : [ idx ] "=&r" (idx),
          [ idx_v1 ] "=&r" (idx_v1),
          [ temp_v1 ] "=&r" (temp_v1),
          [ temp_v2 ] "=&r" (temp_v2),
          [ v2 ] "=&r" (v2)
        // v1 is used as a base register, so it must be "r"; start_cpu is
        // compared against memory, so it may be a register or an immediate
        // but never a memory operand.
        : [ v1 ] "r" (v1),
          [ start_cpu ] "ri" (start_cpu)
        : "memory","cc");
    // clang-format on

    return idx;
}

在保证正确之后,我的第一、第二和第三目标都是速度。所有优化都必须考虑到:执行完任意一条指令之后,代码都可能跳转到中止处理(这就是为什么把temp_v2提交到v2的那次存储是关键部分的最后一条指令)。另外,如果中止是由线程迁移导致的,该函数就不能再写入任何数据(否则会出现严重的竞争条件)。

如果要在用户空间中运行/编译此文件,则需要包含linux/rseq.h标头。设置的一个不错的“ hello world”是here和/或位于librseq

注意:我将其发布在这里,而不是在codereview.SE上发布,因为我的主要问题是如何更快地完成关键部分restarting_l2_set_idx中的程序集。

编辑: @PeterCordes

建议替换此处的leaq:

        "leaq 8(%[v1],%[idx_v1],8),%[v2]\n\t"

我将其更改为此

        "movq %[v1],%[v2]\n\t"         // v2 = v1
        "salq $3,%[idx_v1]\n\t"        // idx_v1 = 8 * idx_v1
        "addq %[idx_v1],%[v2]\n\t"     // v2 += idx_v1 (index by uint64_t)
        "movq 8(%[v2]),%[temp_v2]\n\t" // temp_v2 = *(v + 8)

由于idx_v1现在是它所代表的位位置的8倍,下面的代码也相应做了修改(每组中前一段是旧代码,后一段是新代码):

        // in 7: label
        "btsq %[idx_v1],%[temp_v1]\n\t"

        "sarq $3,%[idx_v1]\n\t"
        "btsq %[idx_v1],%[temp_v1]\n\t"

        // in 9: label
        "salq $6,%[idx_v1]\n\t"

        "salq $3,%[idx_v1]\n\t"

不过,我不确定这是否真的带来了性能提升。我猜瓶颈在于:为了提交,确实需要把v2保存下来。

Edit2: @PeterCordes指出我的编辑很愚蠢:我可以去掉v2这个临时变量,用 movq 8(%[v1],%[idx_v1],8),%[temp_v2] 读取temp_v2,再用 movq %[temp_v2],8(%[v1],%[idx_v1],8) 进行存储。抱歉,我的第一次编辑太天真了 :(

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


依赖报错 idea导入项目后依赖报错,解决方案:https://blog.csdn.net/weixin_42420249/article/details/81191861 依赖版本报错:更换其他版本 无法下载依赖可参考:https://blog.csdn.net/weixin_42628809/a
错误1:代码生成器依赖和mybatis依赖冲突 启动项目时报错如下 2021-12-03 13:33:33.927 ERROR 7228 [ main] o.s.b.d.LoggingFailureAnalysisReporter : *************************** APPL
错误1:gradle项目控制台输出为乱码 # 解决方案:https://blog.csdn.net/weixin_43501566/article/details/112482302 # 在gradle-wrapper.properties 添加以下内容 org.gradle.jvmargs=-Df
错误还原:在查询的过程中,传入的workType为0时,该条件不起作用 &lt;select id=&quot;xxx&quot;&gt; SELECT di.id, di.name, di.work_type, di.updated... &lt;where&gt; &lt;if test=&qu
报错如下,gcc版本太低 ^ server.c:5346:31: 错误:‘struct redisServer’没有名为‘server_cpulist’的成员 redisSetCpuAffinity(server.server_cpulist); ^ server.c: 在函数‘hasActiveC
解决方案1 1、改项目中.idea/workspace.xml配置文件,增加dynamic.classpath参数 2、搜索PropertiesComponent,添加如下 &lt;property name=&quot;dynamic.classpath&quot; value=&quot;tru
删除根组件app.vue中的默认代码后报错:Module Error (from ./node_modules/eslint-loader/index.js): 解决方案:关闭ESlint代码检测,在项目根目录创建vue.config.js,在文件中添加 module.exports = { lin
查看spark默认的python版本 [root@master day27]# pyspark /home/software/spark-2.3.4-bin-hadoop2.7/conf/spark-env.sh: line 2: /usr/local/hadoop/bin/hadoop: No s
使用本地python环境可以成功执行 import pandas as pd import matplotlib.pyplot as plt # 设置字体 plt.rcParams[&#39;font.sans-serif&#39;] = [&#39;SimHei&#39;] # 能正确显示负号 p
错误1:Request method ‘DELETE‘ not supported 错误还原:controller层有一个接口,访问该接口时报错:Request method ‘DELETE‘ not supported 错误原因:没有接收到前端传入的参数,修改为如下 参考 错误2:cannot r
错误1:启动docker镜像时报错:Error response from daemon: driver failed programming external connectivity on endpoint quirky_allen 解决方法:重启docker -&gt; systemctl r
错误1:private field ‘xxx‘ is never assigned 按Altʾnter快捷键,选择第2项 参考:https://blog.csdn.net/shi_hong_fei_hei/article/details/88814070 错误2:启动时报错,不能找到主启动类 #
报错如下,通过源不能下载,最后警告pip需升级版本 Requirement already satisfied: pip in c:\users\ychen\appdata\local\programs\python\python310\lib\site-packages (22.0.4) Coll
错误1:maven打包报错 错误还原:使用maven打包项目时报错如下 [ERROR] Failed to execute goal org.apache.maven.plugins:maven-resources-plugin:3.2.0:resources (default-resources)
错误1:服务调用时报错 服务消费者模块assess通过openFeign调用服务提供者模块hires 如下为服务提供者模块hires的控制层接口 @RestController @RequestMapping(&quot;/hires&quot;) public class FeignControl
错误1:运行项目后报如下错误 解决方案 报错2:Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.8.1:compile (default-compile) on project sb 解决方案:在pom.
参考 错误原因 过滤器或拦截器在生效时,redisTemplate还没有注入 解决方案:在注入容器时就生效 @Component //项目运行时就注入Spring容器 public class RedisBean { @Resource private RedisTemplate&lt;String
使用vite构建项目报错 C:\Users\ychen\work&gt;npm init @vitejs/app @vitejs/create-app is deprecated, use npm init vite instead C:\Users\ychen\AppData\Local\npm-