Contents
背景
在 RHEL7 中, dovecot 在运行的过程中其中一个 imap 进程挂掉,出来这么一个 coredump.
(gdb) f 0
#0 0x00007fbb15868c17 in mail_cache_transaction_open_if_needed (ctx=ctx@entry=0x7fbb173c1430)
at mail-cache-transaction.c:218
218 if (ext->reset_id == cache->hdr->file_seq || i == 2)
(gdb) p cache
$1 = (struct mail_cache *) 0x7fbb173af2d0
(gdb) p cache->hdr
$3 = (const struct mail_cache_header *) 0x7fbb15ce5000
(gdb) p cache->hdr->file_seq
Cannot access memory at address 0x7fbb15ce5008
# cat maps | grep 7fbb15ce5000
7fbb15ce5000-7fbb15ced000 r--s 00000000 00:2c xxxxxxxxx /xxxxxxx/dovecot.index.cache
挂掉的原因是 "signal 7, bus error" (bad memory access)。从 gdb 中可以看到,进程挂的时候在尝试访问 ext->reset_id 和 cache->hdr->file_seq, 而从 maps 可以看出 cache->hdr 指向的是 file-backed (有文件映射) 的地址空间。
那么问题来了,怎样可以把一个结构体,映射到一个文件,而不是匿名页?另外一个问题是,在 gdb 中看到 “Cannot access memory at address” 是意味着这段地址存在问题吗?
测试环境
Debian 8 - jessie
测试程序
从 https://stackoverflow.com/questions/27697228/mmap-and-struct-in-c 中,找到了一段将数据映射到文件的可行代码。下面根据实际需要进行了改动。
这些 man page 会有所帮助: man 2 lseek, man 2 mmap, man 2 open, man 2 write.
写 - 将一个 struct person 存到文件中
写一个 mmtest.c 程序。这个程序会创建一个 /tmp/tom.bin 文件,并将一个 struct person 的内容存放到这个文件中。
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#define FILEPATH "/tmp/tom.bin"
// Sample record that is stored in the mapped file.
// The field order is the on-disk layout: three consecutive ints.
struct person {
    int id;      // record identifier
    int height;  // height value (unit not specified)
    int weight;  // weight value (unit not specified)
};
/*
 * Create FILEPATH, grow it to sizeof(struct person) bytes, map it with
 * MAP_SHARED and store a sample record through the mapping.
 *
 * Returns 0 after the investigation sleep; exits with EXIT_FAILURE as soon
 * as any system call fails.
 */
int main(int argc, char** argv)
{
    int fd;
    off_t end_pos;
    ssize_t written;
    struct person *tom;

    (void)argc;
    (void)argv;

    // Create (or truncate) the backing file, readable/writable by the owner only.
    fd = open(FILEPATH, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
    if (fd == -1)
    {
        perror("Error opening file for writing");
        exit(EXIT_FAILURE);
    }

    // Reserve enough space to store one person struct: seek to the last byte...
    end_pos = lseek(fd, (off_t)sizeof(struct person) - 1, SEEK_SET);
    if (end_pos == (off_t)-1)
    {
        // Report first: close() may overwrite errno.
        perror("Error calling lseek() to 'stretch' the file");
        close(fd);
        exit(EXIT_FAILURE);
    }

    // ...and write a single '\0' there.
    // Now the file size == sizeof(struct person).
    written = write(fd, "", 1);
    if (written != 1)
    {
        perror("Error writing last byte of the file");
        close(fd);
        exit(EXIT_FAILURE);
    }

    // Map the file. tom now points at memory that is backed by the file.
    tom = mmap(NULL, sizeof *tom, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (tom == MAP_FAILED)
    {
        perror("Error mmapping the file");
        close(fd);
        exit(EXIT_FAILURE);
    }

    // Stores through the shared mapping end up in the file.
    tom->id = 5;
    tom->height = 180;
    tom->weight = 65;

    // Buy some time for investigation.
    sleep(300);

    // Release the mapping and the descriptor before exiting.
    if (munmap(tom, sizeof *tom) == -1)
    {
        perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}
可以看到,这个 tom.bin 文件里存放了 id=5, height=180, weight=65 的信息。
0000000 0005 0000 00b4 0000 0041 0000
000000c
读 - 从文件中读取 struct person
另一个程序 mmtest_read.c, 它通过 mmap() 读取这个文件存放的内容。
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#define FILEPATH "/tmp/tom.bin"
// Record layout read back from the mapped file.
// Must match the struct person written by mmtest.c: three consecutive ints.
struct person {
    int id;      // record identifier
    int height;  // height value (unit not specified)
    int weight;  // weight value (unit not specified)
};
/*
 * Map FILEPATH read-only and print the struct person stored in it.
 *
 * The file size is not checked before mapping: if the file is shorter than
 * the mapping (for example after being emptied), reading through tom raises
 * SIGBUS.
 *
 * Returns 0 after the investigation sleep; exits with EXIT_FAILURE if the
 * file cannot be opened or mapped.
 */
int main(int argc, char** argv)
{
    int fd;
    struct person *tom;

    (void)argc;
    (void)argv;

    // Open the file written by mmtest.c. A PROT_READ mapping only needs read access.
    fd = open(FILEPATH, O_RDONLY);
    if (fd == -1)
    {
        perror("Error opening file for reading");
        exit(EXIT_FAILURE);
    }

    // Now tom points to the memory address that is mapped to the file.
    tom = mmap(NULL, sizeof *tom, PROT_READ, MAP_SHARED, fd, 0);
    if (tom == MAP_FAILED)
    {
        // Report first: close() may overwrite errno.
        perror("Error mmapping the file");
        close(fd);
        exit(EXIT_FAILURE);
    }

    // Read data from tom.
    printf("id = %d\n", tom->id);
    printf("height = %d\n", tom->height);
    printf("weight = %d\n", tom->weight);

    // Buy some time for investigation.
    sleep(300);

    // Release the mapping and the descriptor before exiting.
    if (munmap(tom, sizeof *tom) == -1)
    {
        perror("Error un-mmapping the file");
    }
    close(fd);
    return 0;
}
能顺利读取 id=5, height=180, weight=65 的信息。
id = 5
height = 180
weight = 65
查看内存映射 (maps)
从 /proc/<pid>/maps 中可以看到 mmtest_read 进程的内存映射, 其中 /tmp/tom.bin 以 r--s (只读、共享) 的方式映射:
feichas+ 3819 0.0 0.0 4084 684 pts/0 S+ 10:19 0:00 ./mmtest_read
/tmp$ cat /proc/3819/maps
00400000-00401000 r-xp 00000000 fd:00 13770524 /tmp/mmtest_read
00600000-00601000 rw-p 00000000 fd:00 13770524 /tmp/mmtest_read
7f97c9c1a000-7f97c9dbb000 r-xp 00000000 fd:00 5931530 /lib/x86_64-linux-gnu/libc-2.19.so
7f97c9dbb000-7f97c9fbb000 ---p 001a1000 fd:00 5931530 /lib/x86_64-linux-gnu/libc-2.19.so
7f97c9fbb000-7f97c9fbf000 r--p 001a1000 fd:00 5931530 /lib/x86_64-linux-gnu/libc-2.19.so
7f97c9fbf000-7f97c9fc1000 rw-p 001a5000 fd:00 5931530 /lib/x86_64-linux-gnu/libc-2.19.so
7f97c9fc1000-7f97c9fc5000 rw-p 00000000 00:00 0
7f97c9fc5000-7f97c9fe5000 r-xp 00000000 fd:00 5931526 /lib/x86_64-linux-gnu/ld-2.19.so
7f97ca1c7000-7f97ca1ca000 rw-p 00000000 00:00 0
7f97ca1e1000-7f97ca1e2000 rw-p 00000000 00:00 0
7f97ca1e2000-7f97ca1e3000 r--s 00000000 fd:00 13770511 /tmp/tom.bin
7f97ca1e3000-7f97ca1e5000 rw-p 00000000 00:00 0
7f97ca1e5000-7f97ca1e6000 r--p 00020000 fd:00 5931526 /lib/x86_64-linux-gnu/ld-2.19.so
7f97ca1e6000-7f97ca1e7000 rw-p 00021000 fd:00 5931526 /lib/x86_64-linux-gnu/ld-2.19.so
7f97ca1e7000-7f97ca1e8000 rw-p 00000000 00:00 0
7ffc41521000-7ffc41542000 rw-p 00000000 00:00 0 [stack]
7ffc415c6000-7ffc415c8000 r-xp 00000000 00:00 0 [vdso]
7ffc415c8000-7ffc415ca000 r--p 00000000 00:00 0 [vvar]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
gdb - Cannot access memory at address
如果收取 mmtest_read 的 coredump,可以看到,尽管这个程序能正常访问这段内存,但是在 coredump 中仍然会遇到 "Cannot access memory at address" 的问题。这有可能是 coredump 不收集有文件映射的内存数据 (有待查证)。按照 man 5 core 的说明,/proc/<pid>/coredump_filter 的默认值是 0x33,即默认不转储 file-backed 的映射,这与上面的现象吻合。
(gdb) bt
#0 0x00007f97c9cd3f10 in __nanosleep_nocancel () at ../sysdeps/unix/syscall-template.S:81
#1 0x00007f97c9cd3dc4 in __sleep (seconds=0) at ../sysdeps/unix/sysv/linux/sleep.c:137
#2 0x0000000000400795 in main (argc=1, argv=0x7ffc4153f668) at mmtest_read.c:55
(gdb) f 2
#2 0x0000000000400795 in main (argc=1, argv=0x7ffc4153f668) at mmtest_read.c:55
(gdb) p tom
$1 = (struct person *) 0x7f97ca1e2000
(gdb) p *tom
Cannot access memory at address 0x7f97ca1e2000
这么说,在 coredump 里看到"Cannot access memory at address" 并不能说明这段地址出现问题。
signal 7 - bus error
在 dovecot 的 coredump 中看到了 signal 7, 如何能触发 signal 7 呢?
嗯,如果 mmap 的文件出现问题,就可能触发 SIGBUS。比如,清空这个 tom.bin 文件: 此时映射的页已经超出了文件末尾,访问它时内核会向进程发送 SIGBUS (见 man 2 mmap)。
/tmp$ ./mmtest_read
Bus error (core dumped)
/tmp$ gdb mmtest_read core
Program terminated with signal SIGBUS, Bus error.
#0 0x0000000000400748 in main (argc=1, argv=0x7ffed46bbdf8) at mmtest_read.c:43
43 printf("id = %d\n", tom->id);
至于那个背景问题
至于 dovecot 的 signal 7 - bus error 问题,经过调查后,有可能是 mmap() 所使用的 dovecot.index.cache 文件存放在 NFS 中,导致了意外。可以通过在 dovecot 中禁用 mmap (设置 mmap_disable = yes) 解决:
https://wiki2.dovecot.org/NFS