上一篇介绍了文件打开open()操作,在open()操作中,已经获取到了文件描述符fd,fd本质就是file对象的下标,通过fd可以直接得到缓存中的file对象,通过file对象就可以进行文件的写操作write()。
一、示例代码
以下代码功能就是在当前目录下,打开一个文件fout.txt,然后在文件中写入一个学生的信息,调用的写入函数原型如下:
ssize_t write(int fd, const void *buf, size_t count);
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
/* One student record: a name buffer, an age and an exam score. */
struct student
{
    char name[33];
    int age;
    double score;
};

/* Sample record used by the demo program. */
struct student stu = {
    .name = "King Garry",
    .age = 24,
    .score = 93.5,
};
/*
 * Demo: create (or open) "fout.txt" in the current working directory and
 * write one student record formatted as "name-age-score".
 *
 * Every step is checked: a failed getcwd(), open(), write() or close()
 * aborts with EXIT_FAILURE instead of carrying on with an uninitialised
 * path or an invalid descriptor.
 *
 * Returns 0 on success, EXIT_FAILURE on any error.
 */
int main(void)
{
    static const char fname[] = "/fout.txt";
    char cwd[129];
    /* Sized so that any directory that fits in cwd plus fname always fits. */
    char path[sizeof(cwd) + sizeof(fname)];
    char stus[64] = { 0 };
    size_t done = 0;
    int len;
    int fd;

    if (getcwd(cwd, sizeof(cwd)) == NULL) {
        /* cwd is indeterminate here, so it must not be used to build a path. */
        perror("Err info: ");
        return EXIT_FAILURE;
    }
    printf("Curr dir: %s\n", cwd);

    /* Bounded concatenation instead of strcat() into a nearly full buffer. */
    len = snprintf(path, sizeof(path), "%s%s", cwd, fname);
    if (len < 0 || (size_t)len >= sizeof(path)) {
        fprintf(stderr, "Path too long\n");
        return EXIT_FAILURE;
    }

    fd = open(path, O_CREAT | O_WRONLY, 0755);
    if (fd == -1) {
        printf("Open file [%s] fail!\n", path);
        perror("Err info: ");
        return EXIT_FAILURE;
    }

    len = snprintf(stus, sizeof(stus), "%s-%d-%.2f", stu.name, stu.age, stu.score);
    if (len < 0 || (size_t)len >= sizeof(stus)) {
        fprintf(stderr, "Student record does not fit in buffer\n");
        close(fd);
        return EXIT_FAILURE;
    }

    /* write() may transfer fewer bytes than requested; loop until done. */
    while (done < (size_t)len) {
        ssize_t n = write(fd, stus + done, (size_t)len - done);

        if (n <= 0) {
            printf("Write file [%s] fail!\n", path);
            if (n < 0)
                perror("Err info: ");
            close(fd);
            return EXIT_FAILURE;
        }
        done += (size_t)n;
    }

    if (close(fd) == -1) {
        perror("Err info: ");
        return EXIT_FAILURE;
    }

    printf("Finish\n");
    return 0;
}
二、系统调用入口
write()函数属于系统调用,入口在read_write.c中,基本代码逻辑如下:
- 根据整型fd文件描述符获取struct fd对象
- 调用vfs_write()完成文件写入
- 返回写入结果码
// read_write.c
/*
 * write(2) system-call entry point: forwards its three arguments unchanged
 * to ksys_write() and returns that function's result.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
{
	ssize_t written = ksys_write(fd, buf, count);

	return written;
}
/*
 * Resolve @fd to a struct fd, write @count bytes from @buf through
 * vfs_write(), and store the updated position back into the file on success.
 *
 * Returns the vfs_write() result, or -EBADF when @fd has no file.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	loff_t pos, *ppos;
	ssize_t ret;

	if (!f.file)
		return -EBADF;

	/* Work on a local copy of the position; commit it only on success. */
	ppos = file_ppos(f.file);
	if (ppos) {
		pos = *ppos;
		ppos = &pos;
	}

	ret = vfs_write(f.file, buf, count, ppos);
	if (ret >= 0 && ppos)
		f.file->f_pos = pos;

	fdput_pos(f);
	return ret;
}
三、int fd转成struct fd
将int fd转化为struct fd,其实就是查询fd下标对应的file*对象,然后再将file*对象封装成struct fd对象,具体步骤如下:
- 使用current宏,获取当前进程打开的文件对象列表files_struct,也就是current->files(current在X86和ARM架构实现方式不一致)
- 然后在fdt表中,查找fd下标对应的file对象并返回
- 上层将file对象转化成struct fd
// file.c
/*
 * Look up the struct file for descriptor @fd in the current task's file
 * table and return it encoded as an unsigned long.
 *
 * @fd:   descriptor number to look up
 * @mask: f_mode bits that disqualify a file; also forwarded to __fget()
 *
 * Returns 0 when no usable file is found. Otherwise returns the file pointer
 * cast to unsigned long, with FDPUT_FPUT OR-ed in when the file was obtained
 * through __fget().
 * NOTE(review): FDPUT_FPUT presumably tells the caller that a reference must
 * be dropped later - confirm against the fdput() implementation.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
/* Open-file table of the current task. */
struct files_struct *files = current->files;
struct file *file;
/* count == 1: the table has a single user, so the raw lookup is used. */
if (atomic_read(&files->count) == 1) {
file = files_lookup_fd_raw(files, fd);
/* Missing slot, or the file carries one of the masked mode bits. */
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
} else {
/* Table has other users: go through __fget() instead. */
file = __fget(fd, mask, 1);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
}
}
/*
 * Return the struct file stored at index @fd of @files' descriptor table,
 * or NULL when @fd is outside the table.
 *
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
	struct fdtable *table = rcu_dereference_raw(files->fdt);

	if (fd >= table->max_fds)
		return NULL;

	/* Clamp the index under speculation before using it as a subscript. */
	fd = array_index_nospec(fd, table->max_fds);
	return rcu_dereference_raw(table->fd[fd]);
}
四、执行写入操作
前面第三步已经获取了文件对象file*,在写文件时,直接调用file操作表中的操作,最终完成内容的写入,主要步骤如下:
- 校验f_mode
- 校验pos和count
- 调用file_start_write()函数,此函数最终会down超级块的读写信号量:sb->s_writers.rw_sem
- 如果f_op->write存在,则调用此函数进行写入,否则,如果file->f_op->write_iter存在,则调用new_sync_write()函数进行写入
- 调用fsnotify_modify()通知父目录文件变化
- 调用add_wchar()将写入的数据量,更新到进程的struct task_io_accounting的wchar字段
- 调用inc_syscw()将写入系统调用次数,更新到进程的struct task_io_accounting的syscw字段
- 调用file_end_write()函数,此函数最终会up超级块的读写信号量:sb->s_writers.rw_sem,与file_start_write()函数对应
/*
 * Write up to @count bytes from the user buffer @buf to @file at *@pos.
 *
 * Validates the file mode, the user buffer and the target area, clamps the
 * request to MAX_RW_COUNT, then dispatches to ->write or, failing that, to
 * ->write_iter via new_sync_write(). The write is bracketed by
 * file_start_write()/file_end_write().
 *
 * Returns the handler's result, or -EBADF / -EINVAL / -EFAULT / the
 * rw_verify_area() error when a check fails.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t written;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	written = rw_verify_area(WRITE, file, pos, count);
	if (written != 0)
		return written;

	count = (count > MAX_RW_COUNT) ? MAX_RW_COUNT : count;

	file_start_write(file);

	/* Default result when the file provides neither write method. */
	written = -EINVAL;
	if (file->f_op->write)
		written = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		written = new_sync_write(file, buf, count, pos);

	/* Notification and byte accounting only when data was written. */
	if (written > 0) {
		fsnotify_modify(file);
		add_wchar(current, written);
	}
	/* The syscall counter is bumped regardless of the outcome. */
	inc_syscw(current);

	file_end_write(file);
	return written;
}
/*
 * Synchronous write built on ->write_iter: wraps the single user buffer
 * @buf/@len in an iovec + iov_iter, sets up a kiocb positioned at *@ppos
 * (or 0 when @ppos is NULL) and calls call_write_iter().
 *
 * On a positive result the new position is stored back through @ppos.
 * Returns whatever call_write_iter() returned.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct kiocb kiocb;
	struct iov_iter iter;
	struct iovec iov;
	ssize_t ret;

	iov.iov_base = (void __user *)buf;
	iov.iov_len = len;

	init_sync_kiocb(&kiocb, filp);
	if (ppos)
		kiocb.ki_pos = *ppos;
	else
		kiocb.ki_pos = 0;

	iov_iter_init(&iter, WRITE, &iov, 1, len);

	ret = call_write_iter(filp, &kiocb, &iter);
	/* This path is synchronous; a queued-async result is a bug. */
	BUG_ON(ret == -EIOCBQUEUED);

	if (ppos && ret > 0)
		*ppos = kiocb.ki_pos;

	return ret;
}
/*
 * Invoke @file's ->write_iter method with @kio and @iter and return its
 * result.
 */
static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	ssize_t written = file->f_op->write_iter(kio, iter);

	return written;
}
注1:new_sync_write()迭代写
这个函数就是实现迭代IO,对于一段大的缓冲区,可以使用迭代写入到磁盘,先看下两个主要数据结构:
1)struct iovec & iov_iter
iovec:给定了要写入的缓冲区和长度
iov_iter:用于描述一次IO的写入进度,iov指向缓冲区(iovec)数组,iov_offset表示当前段内已处理的偏移量,count表示剩余待写入的字节数,后两个字段会随着写入不断变化
// iovec & iov_iter
/* One user-space buffer segment: a base address plus a length. */
struct iovec
{
void __user *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */
__kernel_size_t iov_len; /* Must be size_t (1003.1g) */
};
/*
 * Iterator over the memory segments of one I/O request.
 * NOTE(review): the field notes below are inferred from names and types
 * only - confirm against the kernel's uio.h.
 */
struct iov_iter {
/* Presumably selects which member of the pointer union below is valid. */
u8 iter_type;
bool nofault;
bool data_source;
/* Presumably the offset already consumed within the current segment. */
size_t iov_offset;
/* Presumably the number of bytes still to be transferred. */
size_t count;
/* Backing storage; exactly one member is meaningful at a time. */
union {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
struct pipe_inode_info *pipe;
};
/* Per-backing-type bookkeeping sharing one storage slot. */
union {
unsigned long nr_segs;
struct {
unsigned int head;
unsigned int start_head;
};
loff_t xarray_start;
};
};
2)struct kiocb
从文件侧来看IO写入情况:ki_filp是file指针,ki_pos是读写的偏移量
/*
 * Per-request I/O control block: ties one I/O operation to its file and
 * carries the file position used for the transfer.
 */
struct kiocb {
/* File the request operates on. */
struct file *ki_filp;
/* The 'ki_filp' pointer is shared in a union for aio */
/* NOTE(review): the start/end markers presumably delimit fields whose order may be randomized - confirm. */
randomized_struct_fields_start
/* File position for this request. */
loff_t ki_pos;
/* Callback taking the iocb and a long result; presumably run on completion - confirm. */
void (*ki_complete)(struct kiocb *iocb, long ret);
void *private;
int ki_flags;
u16 ki_hint;
u16 ki_ioprio; /* See linux/ioprio.h */
struct wait_page_queue *ki_waitq; /* for async buffered IO */
randomized_struct_fields_end
};
注2:call_write_iter()
这个函数最终调用file指针的f_op操作表中的write_iter()完成迭代写。
file->f_op->write_iter(kio, iter)
ext2示例:
/*
 * ext2 ->write_iter handler. With CONFIG_FS_DAX enabled and a DAX inode the
 * request goes to ext2_dax_write_iter(); in every other case it is handled
 * by generic_file_write_iter().
 */
static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
#ifdef CONFIG_FS_DAX
	struct file *filp = iocb->ki_filp;

	if (IS_DAX(filp->f_mapping->host))
		return ext2_dax_write_iter(iocb, from);
#endif
	return generic_file_write_iter(iocb, from);
}
参考资料:官方文档