futex同步机制分析之三内核实现

article/2025/9/30 14:51:31

一、源码引入

前两篇从应用分析到了库,本篇到内核中看看,futex到底何方神圣?(Linux3.1.1) 先看一下futex.c和futex.h(kennel/futex.c linux/futex.h):
/*** struct futex_q - The hashed futex queue entry, one per waiting task* @list:		priority-sorted list of tasks waiting on this futex* @task:		the task waiting on the futex* @lock_ptr:		the hash bucket lock* @key:		the key the futex is hashed on* @pi_state:		optional priority inheritance state* @rt_waiter:		rt_waiter storage for use with requeue_pi* @requeue_pi_key:	the requeue_pi target futex key* @bitset:		bitset for the optional bitmasked wakeup** We use this hashed waitqueue, instead of a normal wait_queue_t, so* we can wake only the relevant ones (hashed queues may be shared).** A futex_q has a woken state, just like tasks have TASK_RUNNING.* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.* The order of wakeup is always to make the first condition true, then* the second.** PI futexes are typically woken before they are removed from the hash list via* the rt_mutex code. See unqueue_me_pi().*/
struct futex_q {struct plist_node list; //内核中的链表节点,用于将数据进行管理struct task_struct \*task;//futex相关联的指针队列-线程或者进程spinlock_t \*lock_ptr;//自旋锁指针union futex_key key;//查找futex的keystruct futex_pi_state \*pi_state;  //控制优先级继承struct rt_mutex_waiter \*rt_waiter;union futex_key \*requeue_pi_key;u32 bitset; //常用的掩码匹配
};
英文注释写得非常清楚,画蛇添足,又增加了中文注释。
/** Priority Inheritance state:*/
struct futex_pi_state {/** list of 'owned' pi_state instances - these have to be* cleaned up in do_exit() if the task exits prematurely:*/struct list_head list;/** The PI object:\*/struct rt_mutex pi_mutex;struct task_struct \*owner;atomic_t refcount;union futex_key key;
};
//key是一个共用体
union futex_key {struct {unsigned long pgoff;struct inode \*inode;int offset;} shared;//futex地址结构体--不同进程间区别用struct {unsigned long address;struct mm_struct \*mm;int offset;} private;//地址结构体--同一进程间区别struct {unsigned long word;void \*ptr;int offset;} both;//共用结构体
};
它们在使用时上面结构体中的相关task会被挂载到一个哈希桶中:
/** Hash buckets are shared by all the futex_keys that hash to the same* location.  Each key may have multiple futex_q structures, one for each task* waiting on a futex.*/
struct futex_hash_bucket {spinlock_t lock;struct plist_head chain;
};static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
区别它们的办法就是通过上面的key来查找相关的futex的。看一下相关的futex进程应用模型:

在这里插入图片描述

futex的应用是要有一个init函数的:

static int __init futex_init(void)
{u32 curval;int i;/** This will fail and we want it. Some arch implementations do* runtime detection of the futex_atomic_cmpxchg_inatomic()* functionality. We want to know that before we call in any* of the complex code paths. Also we want to prevent* registration of robust lists in that case. NULL is* guaranteed to fault and we get -EFAULT on functional* implementation, the non-functional ones will return* -ENOSYS.\*/if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)futex_cmpxchg_enabled = 1;for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {plist_head_init(&futex_queues[i].chain);spin_lock_init(&futex_queues[i].lock);}return 0;
}
在前面的第二篇glibc的分析中提到了三种futex,在这里都可以找到,主要说明pi-futex: pi-futex,也就是Priority Inheritance,优先级继承,如果学习过操作系统原理的一定知道“优先级反转”这个名词,它就是用来解决优先级反转的一种办法。 看一下相关的代码,这里不全列出,如有兴趣可参看内核的代码:
static int wake_futex_pi(u32 \__user *uaddr, u32 uval, struct futex_q *this)
{struct task_struct *new_owner;struct futex_pi_state *pi_state = this->pi_state;u32 curval, newval;if (!pi_state)return -EINVAL;/** If current does not own the pi_state then the futex is* inconsistent and user space fiddled with the futex value.*/if (pi_state->owner != current)return -EINVAL;raw_spin_lock(&pi_state->pi_mutex.wait_lock);new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);/** It is possible that the next waiter (the one that brought* this owner to the kernel) timed out and is no longer* waiting on the lock.*/if (!new_owner)new_owner = this->task;/** We pass it to the next owner. (The WAITERS bit is always* kept enabled while there is PI state around. We must also* preserve the owner died bit.)*/if (!(uval & FUTEX_OWNER_DIED)) {int ret = 0;newval = FUTEX_WAITERS | task_pid_vnr(new_owner);if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))ret = -EFAULT;else if (curval != uval)ret = -EINVAL;if (ret) {raw_spin_unlock(&pi_state->pi_mutex.wait_lock);return ret;}}raw_spin_lock_irq(&pi_state->owner->pi_lock);WARN_ON(list_empty(&pi_state->list));list_del_init(&pi_state->list);raw_spin_unlock_irq(&pi_state->owner->pi_lock);raw_spin_lock_irq(&new_owner->pi_lock);WARN_ON(!list_empty(&pi_state->list));list_add(&pi_state->list, &new_owner->pi_state_list);pi_state->owner = new_owner;raw_spin_unlock_irq(&new_owner->pi_lock);raw_spin_unlock(&pi_state->pi_mutex.wait_lock);rt_mutex_unlock(&pi_state->pi_mutex);return 0;
}
static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{u32 oldval;/** There is no waiter, so we unlock the futex. The owner died* bit has not to be preserved here. We are the owner:*/if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))return -EFAULT;if (oldval != uval)return -EAGAIN;return 0;
}
/*** requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue* @q:		the futex_q* @key:	the key of the requeue target futex* @hb:		the hash_bucket of the requeue target futex** During futex_requeue, with requeue_pi=1, it is possible to acquire the* target futex if it is uncontended or via a lock steal.  Set the futex_q key* to the requeue target futex so the waiter can detect the wakeup on the right* futex, but remove it from the hb and NULL the rt_waiter so it can detect* atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock* to protect access to the pi_state to fixup the owner later.  Must be called* with both q->lock_ptr and hb->lock held.\*/
static inline
void requeue_pi_wake_futex(struct futex_q \*q, union futex_key \*key,struct futex_hash_bucket \*hb)
{get_futex_key_refs(key);q->key = \*key;__unqueue_futex(q);WARN_ON(!q->rt_waiter);q->rt_waiter = NULL;q->lock_ptr = &hb->lock;wake_up_state(q->task, TASK_NORMAL);
}
从这里可以看出,内核针对不同的场景,设计了三种不同的futex,而上面的pi-futex正是用来处理优先级反转的一种,优先级反转是一种进程调用现象,在进程(线程)管理中,低优先级的进程(线程)掌握了高优先级的进程的同步资源,那么就需要处理一下优先级来达到尽快让高优先级的进程(线程)执行,解决优先级翻转问题有优先级天花板(priority ceiling)和优先级继承(priority inheritance)两种办法。内核中使用的是后者,即临时把低优先级的进程提高优先级,参与高优先级的共同执行,待释放资源后再将其降低到原来的优先级。 这里需要重点提一下requeue这个功能,这是为了更好的支持pthread_cond的broad_cast,用过条件变量的同学们可能都知道,这玩意儿一般会和一个mutex共同来使用,保证线程操作的顺序(底层通过cond和mutex的从属关系来达到),即它包含了两次等待,或两个队列等待。一个是cond信号的等待(队列),另一个是它所属的mutex锁的等待(队列)。这使得一次cond wait必须先释放mutex,然后依次等待cond信号和mutex。那这样的结果就是会造成两次的等待和唤醒,可是在广播唤醒时,只需唤醒一个线程就可以了,不需要惊醒所有的线程来竞争。所以这个requeue就是用来优化这部分的。其实就是将两个等待的唤醒队列优化成一次。 所以说,不读内核源码,你对线程的理解总是浮在上面的,无法掌握其本质。

二、调用方式

看一下对外的接口:

SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,struct timespec __user *, utime, u32 __user *, uaddr2,u32, val3)
{struct timespec ts;ktime_t t, *tp = NULL;u32 val2 = 0;int cmd = op & FUTEX_CMD_MASK;if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||cmd == FUTEX_WAIT_BITSET ||cmd == FUTEX_WAIT_REQUEUE_PI)) {if (copy_from_user(&ts, utime, sizeof(ts)) != 0)return -EFAULT;if (!timespec_valid(&ts))return -EINVAL;t = timespec_to_ktime(ts);if (cmd == FUTEX_WAIT)t = ktime_add_safe(ktime_get(), t);tp = &t;}/** requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.*/if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)val2 = (u32) (unsigned long) utime;return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,u32 __user *uaddr2, u32 val2, u32 val3)
{int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;unsigned int flags = 0;if (!(op & FUTEX_PRIVATE_FLAG))flags |= FLAGS_SHARED;if (op & FUTEX_CLOCK_REALTIME) {flags |= FLAGS_CLOCKRT;if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)return -ENOSYS;}switch (cmd) {case FUTEX_WAIT:val3 = FUTEX_BITSET_MATCH_ANY;case FUTEX_WAIT_BITSET:ret = futex_wait(uaddr, flags, val, timeout, val3);break;case FUTEX_WAKE:val3 = FUTEX_BITSET_MATCH_ANY;case FUTEX_WAKE_BITSET:ret = futex_wake(uaddr, flags, val, val3);break;case FUTEX_REQUEUE:ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);break;case FUTEX_CMP_REQUEUE:ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);break;case FUTEX_WAKE_OP:ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);break;case FUTEX_LOCK_PI:if (futex_cmpxchg_enabled)ret = futex_lock_pi(uaddr, flags, val, timeout, 0);break;case FUTEX_UNLOCK_PI:if (futex_cmpxchg_enabled)ret = futex_unlock_pi(uaddr, flags);break;case FUTEX_TRYLOCK_PI:if (futex_cmpxchg_enabled)ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);break;case FUTEX_WAIT_REQUEUE_PI:val3 = FUTEX_BITSET_MATCH_ANY;ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,uaddr2);break;case FUTEX_CMP_REQUEUE_PI:ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);break;default:ret = -ENOSYS;}return ret;
}
通过代码可以看到,系统调用会调用do_futex,而它又会通过op来产生cmd来确定具体的调用的FUTEX的形式。因为此处不是分析内核源码,所以点到为止。 三篇futex的文章,写得有些粗糙,但是如果跟踪库和内核仔细分析下去,估计还得写好多,这就不是分析futex的本意了。

三、总结

前面分析了futex的各种优点,它有什么缺点没?有,而且很坑,搞JAVA的如果搞得深的都遇到过这个问题:

https://github.com/torvalds/linux/commit/76835b0ebf8a7fe85beb03c75121419a7dec52f0,在get_futex_key_refs这个函数里,老大们疏忽了。

还有这个:

https://ma.ttias.be/linux-futex_wait-bug/

还有一个是拒绝服务的漏洞,好像跟它也有关系,好东西,不可能啥都好,是不是?

在这里插入图片描述


http://chatgpt.dhexx.cn/article/P83TRGcR.shtml

相关文章

futex同步机制分析之一应用

futex同步机制分析之一应用 一、多线程&#xff08;进程&#xff09;的同步机制 c编程中最难的部分有哪些&#xff0c;估计绝大多数人都会首先提出来是多线程&#xff08;进程&#xff09;编程。为什么多线程编程难呢&#xff1f;一个主要的原因就是多线程的同步。在多线程同步…

FUTEX_SWAP补丁分析-SwitchTo 如何大幅度提升切换性能?

作者简介 胡哲宁&#xff0c;西安邮电大学计算机科学与技术专业大二学生。 Google SwitchTo 由于协程本身对操作系统的不可见性&#xff0c;协程中出现的 BUG 往往不能通过一些已有的工具去排查。在谷歌内部有一套闭源的用户态任务调度框架 SwitchTo, 这个框架可以为谷歌提供延…

futex问答

一、什么是futex&#xff1f; futex是Fast Userspace muTEX的缩写&#xff0c;该机制是由Rusty Russell、Hubertus Franke和Mathew Kirkwood在2.5.7版本的内核中引入&#xff0c;虽然名字中有互斥锁&#xff08;mutex&#xff09;的含义&#xff0c;但实际它是一种用于用户空间…

Futex系统调用,Futex机制,及具体案例分析

Futex 1、背景1.1 自己实现锁1.1.1 自旋锁1.1.2 sleep自旋1.1.3 小结 1.2 futex1.2.1 什么是Futex1.2.2 futex诞生之前1.2.3 futex诞生之后 2、Futex系统调用3、Futex机制4、具体案例分析4.1 在Bionic中的实现4.2 C语言实现 5、参考及扩展阅读 首先要区分一下futex系统调用和fu…

深度讲解futex问答(上)

一、什么是futex&#xff1f; futex是Fast Userspace muTEX的缩写&#xff0c;该机制是由Rusty Russell、Hubertus Franke和Mathew Kirkwood在2.5.7版本的内核中引入&#xff0c;虽然名字中有互斥锁&#xff08;mutex&#xff09;的含义&#xff0c;但实际它是一种用于用户空间…

数组中的字节数

##sizeof查看定义的数组所占用字节数 #include<iostream> int main(){using namespace std;int Inter [10];short sh [10];char ch [10];long lg [10];float fl [10];double dou [10];cout << "int style has " << sizeof Inter << " …

int转byte数组以及相关原理

前言 本文由int转byte数组这样的题目代码引发的思考&#xff0c;其中涉及到多个让我混淆的地方。 直接上代码 public byte[] toBytes(int number){byte[] bytes new byte[4];bytes[3] (byte)number;bytes[2] (byte) ((number >> 8) & 0xFF);bytes[1] (byte) ((…

java的byte数组的不同写法

经常看到java中对byte数组的不同定义&#xff0c;粗略整理的一下&#xff1a; 一个字节&#xff08;byte&#xff09;&#xff1d;8位&#xff08;bit&#xff09;&#xff0c;“byte数组”里面全部是“byte”&#xff0c;即每一个byte都可以用二进制、十六进制、十进制来表示。…

byte数组快速拷贝,byte数组合并,System.arraycopy详解

博客来源&#xff1a; 项目过程中用到byte[]数组相加问题&#xff0c;给出两个byte[] 需要合并成一个byte[]进行计算…那么需求来了……数据量达10W级&#xff0c;怎么合并 调用系统自带方法&#xff08;System.arraycopy&#xff09; 参考程序 org.junit.Test public void f…

程序、进程、线程的区别

程序、进程、线程的区别 进程是程序的实体&#xff0c;而线程又是进程的实体。进程又是线程的容器。 程序、进程、线程三者区别如下: 1.程序&#xff1a;程序并不能单独执行&#xff0c;是静止的&#xff0c;只有将程序加载到内存中&#xff0c;系统为其分配资源后才能够执…

操作系统-进程与线程的区别

操作系统-进程与线程的区别 1.什么是进程 简单的讲&#xff0c;进程是执行的程序&#xff0c;资源分配的最小单位。 进程包括&#xff1a;文本段&#xff08;程序代码&#xff09;、程序计数器的值、CPU寄存器的内容、堆、栈、数据段 进程的状态&#xff1a;新的、就绪、运行…

进程和线程的区别(重点)

来源&#xff1a;http://www.cnblogs.com/lmule/archive/2010/08/18/1802774.html 简而言之,一个程序至少有一个进程,一个进程至少有一个线程. 线程的划分尺度小于进程&#xff0c;使得多线程程序的并发性高。 另外&#xff0c;进程在执行过程中拥有独立的内存单元&#xff0c…

进程与线程的区别和联系

程序并不能单独执行&#xff0c;只有将程序加载到内存中&#xff0c;系统为他分配资源后才能够执行&#xff0c;这种执行的程序称之为进程&#xff0c;也就是说进程是系统进行资源分配和调度的一个独立单位&#xff0c;每个进程都有自己单独的地址空间。所以说程序与进程的区别…

进程和线程的区别和联系

我们都知道计算机的核心是CPU,它承担了所有的计算任务,而操作系统是计算机的管理者,它负责任务的调度,资源的分配和管理,统领整个计算机硬件;应用程序是具有某种功能的程序,程序是运行于操作系统之上的。 进程 进程是一个具有一定独立功能的程序在一个数据集上的一次动…

进程与线程的区别及联系

目录 1. 操作系统功能简介 2. 进程 2.1 认识进程 2.2 进程操作系统中如何管理 2.3 PCB如何描述 2.3.1 pid 2.3.2 内存指针 2.3.3 文件描述符表 2.3.4 进程调度相关属性 3. 内存管理 4. 线程 4.1 认识线程 4.2 进程与线程的关系 4.3 线程安全问题 1.操作系统功能简…

Linux进程与线程的区别

进程与线程的区别&#xff0c;早已经成为了经典问题。自线程概念诞生起&#xff0c;关于这个问题的讨论就没有停止过。无论是初级程序员&#xff0c;还是资深专家&#xff0c;都应该考虑过这个问题&#xff0c;只是层次角度不同罢了。一般程序员而言&#xff0c;搞清楚二者的概…

win10安装时,提示“我们无法创建新的分区,也找不到现有分区”

win10安装时&#xff0c;提示“我们无法创建新的分区&#xff0c;也找不到现有分区”&#xff0c;如图所示&#xff1a; 解决办法&#xff1a; 将win10安装包&#xff08;ios文件&#xff09;解压&#xff0c;将以下文件复制到系统盘&#xff0c;然后重启电脑&#xff0c;自动…

我们无法创建新分区。【错误:0x80042468】

一台服务器6块1.8T SAS 10K&#xff0c;做RAID10. 安装windows server 2012R2系统在分区时报错&#xff0c;总有个分区不能创建成功。&#xff08;正常安装系统后&#xff0c;磁盘管理也有一个磁盘不能创建新的分区&#xff09; 提示“我们无法创建新分区。【错误&#xff1a;0…

重装系统“无法创建新的分区也找不到现有分区”

如题&#xff0c;一开始我使用分区工具对硬盘进行分区后再进入安装系统&#xff0c;一直出现这个问题报错。 (因我没拍照片&#xff0c;从网上找的图片&#xff0c;侵权请联系我) 后来我删除硬盘分区&#xff0c;直接在这里进行分区&#xff0c;就可以装了。如图&#xff0c…