前面介绍了文件系统的各种结构体,那么一次打开文件过程,需要和哪些结构体产生联系呢?要解决上面提出的问题,就要先搞清楚,函数是调用流程是怎样的,调用过程中串联了哪些数据结构,说起来也就是回答如下几个疑问:
- 打开文件的内核入口在哪里?
- 打开文件时,如何知道当前属于哪个文件系统呢?
- 如果文件已存在,那又如何获取目录项dentry和索引节点inode呢?
- 文件对象file是怎么构造的?
- 文件描述符是怎么产生的,又是如何跟file对象关联起来?
一、代码示例
以下代码示例,描述了一个打开文件、写文件、关闭文件的过程。为了避开libc的影响,没有使用C语言的fopen()打开文件,而是在代码中直接使用了系统调用open()。代码编译后在用户态运行,通过系统调用会进入内核态。
简单说明一下,本文不会介绍系统调用表如何生成,会直接从内核函数开始,关于系统调用表,在其他文章中会详细介绍。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
struct student
{
char name[33];
int age;
double score;
} stu = {"King Garry", 24, 93.5};
int main()
{
int fd;
char fname[] = "/fout.txt";
char cwd[129];
char stus[64] = { 0 };
if(getcwd(cwd, sizeof(cwd)) != NULL) {
printf("Curr dir: %s\n", cwd);
} else {
perror("Err info: ");
}
strcat(cwd, fname);
fd = open(cwd, O_CREAT | O_WRONLY, 0755);
if (fd == -1) {
printf("Open file [%s] fail!\n", cwd);
perror("Err info: ");
}
sprintf(stus, "%s-%d-%.2f", stu.name, stu.age, stu.score);
write(fd, stus, strlen(stus));
close(fd);
printf("Finish\n");
return 0;
}
open()函数原型:
int open(const char *pathname, int flags);
int open(const char *pathname, int flags, mode_t mode);
二、打开文件
打开文件的详细流程图,参考文章开头的图片。
1.进入内核入口
在用户态调用open()函数时,会进入内核如下代码,执行do_sys_open()函数。参数有filename、flags、mode这些都是用户态传入的,另外还多了一个参数AT_FDCWD,本文先不做介绍。
紧接着在调用do_sys_openat2()函数之前,调用了build_open_how()函数,将flags和mode两个参数,整合成了struct open_how类型。内核入口就这么多,我们继续往下看。
// fs/open.c
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
}
2.获取文件描述符
接下来在do_sys_openat2()函数中,又将struct open_how类型,转换成struct open_flags类型,具体做了什么,这里不做详细介绍。
static long do_sys_openat2(int dfd, const char __user *filename,
struct open_how *how)
{
struct open_flags op;
int fd = build_open_flags(how, &op);
struct filename *tmp;
if (fd)
return fd;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
if (fd >= 0) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}
putname(tmp);
return fd;
}
下面列出这段代码最重要的几个函数是:
1)do_filp_open():执行打开文件,获取文件对象指针file*,下面会详细讲解,这里不做介绍
2)get_unused_fd_flags():获取文件描述符fd:
文件描述符是通过alloc_fd()函数分配,这个函数里根据struct fdtable结构体中unsigned long *full_fds_bits; 这个字段的来生成fd。
full_fds_bits是一个long型数组,每个元素二进制是32位或64位(取决于系统类型),在内核当中也叫位图Bitmap。每个进程都有自己的file对象数组,对应就有一个文件描述符fd位图。假设full_fds_bits是64行,每行64个位,这样就有4096个fd可用,也就是说每个文件打开的文件数(使用ulimit -n xxx可修改)。
在分配fd时,从full_fds_bits数组当中逐行检查,寻找这一行所有位当中,是否有哪一位是0,有的话把对应的位置作为fd,然后标志成1,表示busy。如果当前行的位全是1,那就继续到下一行寻找,直到直到找到为止。实现函数find_next_zero_bit()。
/*
* allocate a file descriptor, mark it busy.
*/
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
struct files_struct *files = current->files;
unsigned int fd;
int error;
struct fdtable *fdt;
spin_lock(&files->file_lock);
repeat:
fdt = files_fdtable(files);
fd = start;
if (fd < files->next_fd)
fd = files->next_fd;
if (fd < fdt->max_fds)
fd = find_next_fd(fdt, fd);
......
}
/**
* find_next_zero_bit - find the next cleared bit in a memory region
* @addr: The address to base the search on
* @offset: The bitnumber to start searching at
* @size: The bitmap size in bits
*
* Returns the bit number of the next zero bit
* If no bits are zero, returns @size.
*/
static inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
if (small_const_nbits(size)) {
unsigned long val;
if (unlikely(offset >= size))
return size;
val = *addr | ~GENMASK(size - 1, offset);
return val == ~0UL ? size : ffz(val);
}
return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
}
3)fd_install():打开文件后安装fd和file*,也就是关联fd和文件对象指针,其实最关键操作就是rcu_assign_pointer(fdt->fd[fd], file),将文件对象file赋值给fd下标的元素。
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
struct fdtable *fdt;
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
}
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
3.路径解析获取文件dentry***
打开文件过程非常重要一步,就是解析文件路径分量,用户态传递的文件路径可能是绝对的,也可能是相对的,解析路径分量目的就是识别完整路径,最终获取文件的父目录dentry,整个过程一共分为如下4个步骤:
- 构造file文件对象结构体(函数:alloc_empty_file()),后续会填充属性信息
- 构造nameidata结构体,用于后续的路径分量解析时,记录过程数据
- path初始化,从当前的进程的task_struct结构里,获取工作目录的path结构(主要使用dentry)
- 然后逐级路径分量解析,直到当前文件的父目录,获取父目录的dentry信息,存储到nameidata的path域,同时把最后解析出来路径分量文件名(就是要打开的文件),存储到nameidata的last域
- 根据父目录dentry信息,如果是已存在文件,就查询当前文件的dentry项;如果是新文件,那么就构造新的dentry,同时调用inode的操作create创建文件
1)构造nameidata结构体:
nameidata这个结构体,前四个字段,记录了当前目录path对象、文件名、根path对象、以及当前目录的索引节点。
struct nameidata {
struct path path;
struct qstr last;
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags, state;
unsigned seq, m_seq, r_seq;
int last_type;
unsigned depth;
int total_link_count;
struct saved {
struct path link;
struct delayed_call done;
const char *name;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
unsigned root_seq;
int dfd;
kuid_t dir_uid;
umode_t dir_mode;
} __randomize_layout;
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
struct nameidata *old = current->nameidata;
p->stack = p->internal;
p->depth = 0;
p->dfd = dfd;
p->name = name;
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
p->saved = old;
current->nameidata = p;
}
2)path初始化
以下就是path初始化代码片段,最重要代码就是从current中获取fs_struct结构体,这样就得到当前进程工作目录的path对象(fs->pwd),也就是文件的父目录的path对象,进而就得到了dentry项和索引节点,然后把这些信息赋值给nameidata结构nd,供后续使用。
那current是什么?看内核代码可知,在X86环境下,current是个per-cpu变量,栈顶就是用户态进程结构task_struct;而在Arm环境中,这个结构放在寄存器sp0里,可以从寄存器快速获取。
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task);
}
/* must be paired with terminate_walk() */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
const char *s = nd->name->name;
......
/* Relative pathname -- get the starting-point it is relative to. */
if (nd->dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
}
}
......
}
3)逐级路径分量解析
路径分量解析是整个打开文件的核心,代码逻辑非常复杂,涉及多个函数嵌套调用:link_path_walk(大循环)->walk_component(处理单个分量)->step_into(深入处理当前分量)->handle_mount(应对挂载点)->pick_link(处理软连接)等等。
但是,这个步骤的目标是清晰的,一共有两个:一个是要获得打开文件父目录的目录项detnry,围绕这个目标,路径解析要应对很多复杂情况,下面罗列一下:
- 路径中可能有.(上层目录)或..(上上层目录),需要进行Jump操作
- 路径中某个目录可能是另外一个文件系统挂载点,要跟踪这个挂载信息,切换文件系统
- 路径中某个分量可能是软链接symlink,需要跟踪这个链接到目标
为了理解整个过程,这里画了一个处理流程图,这是代码调用的大体流程,稍微简化了一点,实际代码中包含了很多判断逻辑更加复杂:
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
* the final dentry. We expect 'base' to be positive and a directory.
*
* Returns 0 and nd will have valid dentry and mnt on success.
* Returns error and drops reference to input namei data on failure.
*/
static int link_path_walk(const char *name, struct nameidata *nd)
{
int depth = 0; // depth <= nd->depth
int err;
nd->last_type = LAST_ROOT;
nd->flags |= LOOKUP_PARENT;
while (*name=='/') // 去除前面的斜杠
name++;
/* 此处开始大循环,进行路径分量解析,代码叫path component. */
for(;;) {
struct user_namespace *mnt_userns;
const char *link;
u64 hash_len;
int type;
mnt_userns = mnt_user_ns(nd->path.mnt);
// 获取当前分量hash长度
hash_len = hash_name(nd->path.dentry, name);
// 检查是否有.或..,如果有要进行JUMP,记录到nd->state里
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->state |= ND_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
......
nd->last.hash_len = hash_len;
nd->last.name = name;
nd->last_type = type;
name += hashlen_len(hash_len);
if (!*name) // 如果路径结束了,goto OK 进行解析
goto OK;
......
if (unlikely(!*name)) { // 已经是最后一个component了
OK:
/* pathname or trailing symlink, done */
if (!depth) {
nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode);
nd->dir_mode = nd->inode->i_mode;
nd->flags &= ~LOOKUP_PARENT;
return 0;
}
/* last component of nested symlink */
name = nd->stack[--depth].name; // 记录文件名
link = walk_component(nd, 0); // 解析当前路径分量,获取dentry信息
} else {
/* not the last component */
link = walk_component(nd, WALK_MORE); // 不是最后一个component
}
......
}
}
static const char *walk_component(struct nameidata *nd, int flags)
{
struct dentry *dentry;
struct inode *inode;
unsigned seq;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM)) {
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
dentry = lookup_fast(nd, &inode, &seq);
......
if (!(flags & WALK_MORE) && nd->depth)
put_link(nd);
return step_into(nd, flags, dentry, inode, seq);
}
4)搜索(或构造)打开文件的dentry
经过第3步的路径解析,已经获取了父目录的的dentry项,有了父目录的dentry项,是不是就可以搜索要打开文件的dentry了。经过前面几个步骤的铺垫,这才刚刚进入正题.
接下来就是调用open_last_lookups()函数,搜索打开文件的Dentry,如果未搜索到,将会构造一个dentry项,一起来看下代码:
- open_last_lookups()函数中,会先给nameidata结构path域中dentry项,对应的inode加锁,然后调用lookup_open()函数
- lookup_open()函数中,使用传入的path域,作为文件的父dentry,last域作为文件名,调用d_lookup()函数,搜索文件的dentry
- 检查搜索结果,如果dentry不存在,使用d_alloc_parallel()构造一个dentry,然后调用父目录的dir_inode->i_op->create()创建文件,注意参数是父项inode,当前文件dentry。
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
unsigned seq;
struct inode *inode;
struct dentry *dentry;
const char *res;
nd->flags |= op->intent;
......
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
fsnotify_create(dir->d_inode, dentry);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (got_write)
mnt_drop_write(nd->path.mnt);
......
return res;
}
/*
* Look up and maybe create and open the last component.
*
* Must be called with parent locked (exclusive in O_CREAT case).
*
* Returns 0 on success, that is, if
* the file was successfully atomically created (if necessary) and opened, or
* the file was not completely opened at this time, though lookups and
* creations were performed.
* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
* In the latter case dentry returned in @path might be negative if O_CREAT
* hadn't been specified.
*
* An error code is returned on failure.
*/
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
const struct open_flags *op,
bool got_write)
{
struct user_namespace *mnt_userns;
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
int open_flag = op->open_flag;
struct dentry *dentry;
int error, create_error = 0;
umode_t mode = op->mode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
return ERR_PTR(-ENOENT);
file->f_mode &= ~FMODE_CREATED;
dentry = d_lookup(dir, &nd->last);
for (;;) {
if (!dentry) {
dentry = d_alloc_parallel(dir, &nd->last, &wq);
}
......
}
......
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
file->f_mode |= FMODE_CREATED;
......
error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry,
mode, open_flag & O_EXCL);
if (error)
goto out_dput;
}
if (unlikely(create_error) && !dentry->d_inode) {
error = create_error;
goto out_dput;
}
return dentry;
......
}
5.打开文件,获取file对象
经过上面充分的准备,终于来到了最后一个操作:打开文件。
这一步有几个函数调用:do_open() -> vfs_open() -> do_dentry_open(),其中最重要操作就在do_dentry_open中,这个函数最终调用file对象操作表中的open()函数打开文件。如果分步来看的话,一共以下几个步骤:
- 给文件对象file的属性赋值:f_inode 、f_mapping、f_wb_err、f_sb_err等
- 将inode的i_fop域赋值给file对象的f_op域,那i_fop域的值是哪来的呢?这个值是,文件系统注册到VFS时,携带给inode的操作表,里面包含了打开文件的open()函数。
- 调用file对象的f_op->open()函数打开文件
- 给文件对象file的属性赋值:f_mode、f_write_hint 、f_ra等
注意:关联fd和file对象,已经在第2步调用处介绍,此部分不再赘述
/*
* Handle the last step of open()
*/
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
......
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
......
return error;
}
/**
* vfs_open - open the file at the given path
* @path: path to open
* @file: newly allocated file with f_flag initialized
* @cred: credentials to use
*/
int vfs_open(const struct path *path, struct file *file)
{
file->f_path = *path;
return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
int error;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
f->f_sb_err = file_sample_sb_err(f);
......
f->f_op = fops_get(inode->i_fop);
/* normally all 3 are set; ->open() can clear them if needed */
f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
.....
return error;
}
三、总结
这一部分总结一下打开文件的几个步骤:生成文件描述符、构造文件对象、路径分量解析、最终打开文件。产生联系的重要数据结构有:
- struct file
- struct path
- struct task_struct
- struct fs_struct
- struct nameidata
- struct dentry
- struct inode
接下来回答一下开头提出的问题:
- 打开文件的内核入口在哪里?–do_sys_open()函数
- 打开文件时,如何知道当前属于哪个文件系统呢?–根据getcurrent()函数,从进程的task_struct结构的fs域,通过fs->pwd获取
- 如果文件已存在,那又如何获取目录项dentry和索引节点inode呢?–在路径解析步骤,通过传递父dentry和文件名搜索出文件的dentry,进而得到inode
- 文件对象file是怎么构造的?–通过函数alloc_empty_file()构造,然后不断填充属性信息
- 文件描述符是怎么产生的,又是如何跟file对象关联起来?–文件描述符fd,就是位图中为0的位置,通过fdtable与fd关联
上一篇:Linux文件系统数据结构详解:文件对象struct file
源码:5.16.7
《011 Linux一次文件打开过程open()》有2个想法