Staging
v0.5.1
v0.5.1
https://github.com/torvalds/linux
Revision 5b41e74ad1b0bf7bc51765ae74e5dc564afc3e48 authored by Dmitri Monakhov on 28 March 2008, 21:15:52 UTC, committed by Linus Torvalds on 28 March 2008, 21:45:21 UTC
Current nobh_write_end() implementation ignore partial writes(copied < len) case if page was fully mapped and simply mark page as Uptodate, which is totally wrong because area [pos+copied, pos+len) wasn't updated explicitly in previous write_begin call. It simply contains garbage from pagecache and result in data leakage. #TEST_CASE_BEGIN: ~~~~~~~~~~~~~~~~ In fact issue triggered by classical testcase open("/mnt/test", O_RDWR|O_CREAT|O_TRUNC, 0666) = 3 ftruncate(3, 409600) = 0 writev(3, [{"a", 1}, {NULL, 4095}], 2) = 1 ##TESTCASE_SOURCE: ~~~~~~~~~~~~~~~~~ #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/uio.h> #include <sys/mman.h> #include <errno.h> int main(int argc, char **argv) { int fd, ret; void* p; struct iovec iov[2]; fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666); ftruncate(fd, 409600); iov[0].iov_base="a"; iov[0].iov_len=1; iov[1].iov_base=NULL; iov[1].iov_len=4096; ret = writev(fd, iov, sizeof(iov)/sizeof(struct iovec)); printf("writev = %d, err = %d\n", ret, errno); return 0; } ##TESTCASE RESULT: ~~~~~~~~~~~~~~~~~~ [root@ts63 ~]# mount | grep mnt2 /dev/mapper/test on /mnt2 type ext2 (rw,nobh) [root@ts63 ~]# /tmp/writev /mnt2/test writev = 1, err = 0 [root@ts63 ~]# hexdump -C /mnt2/test 00000000 61 65 62 6f 6f 74 00 00 f0 b9 b4 59 3a 00 00 00 |aeboot.....Y:...| 00000010 20 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 | .......!.......| 00000020 df df df df df df df df df df df df df df df df |................| 00000030 3a 00 00 00 2a 00 00 00 21 00 00 00 00 00 00 00 |:...*...!.......| 00000040 60 c0 8c 00 00 00 00 00 40 4a 8d 00 00 00 00 00 |`.......@J......| 00000050 00 00 00 00 00 00 00 00 41 00 00 00 00 00 00 00 |........A.......| 00000060 74 69 6d 65 20 64 64 20 69 66 3d 2f 64 65 76 2f |time dd if=/dev/| 00000070 6c 6f 6f 70 30 20 20 6f 66 3d 2f 64 65 76 2f 6e |loop0 of=/dev/n| skip.. 00000f50 00 00 00 00 00 00 00 00 31 00 00 00 00 00 00 00 |........1.......| 00000f60 6d 6b 66 73 2e 65 78 74 33 20 2f 64 65 76 2f 76 |mkfs.ext3 /dev/v| 00000f70 7a 76 67 2f 74 65 73 74 20 2d 62 34 30 39 36 00 |zvg/test -b4096.| 00000f80 a0 fe 8c 00 00 00 00 00 21 00 00 00 00 00 00 00 |........!.......| 00000f90 23 31 32 30 35 39 35 30 34 30 34 00 3a 00 00 00 |#1205950404.:...| 00000fa0 20 00 8d 00 00 00 00 00 21 00 00 00 00 00 00 00 | .......!.......| 00000fb0 d0 cf 8c 00 00 00 00 00 10 d0 8c 00 00 00 00 00 |................| 00000fc0 00 00 00 00 00 00 00 00 41 00 00 00 00 00 00 00 |........A.......| 00000fd0 6d 6f 75 6e 74 20 2f 64 65 76 2f 76 7a 76 67 2f |mount /dev/vzvg/| 00000fe0 74 65 73 74 20 20 2f 76 7a 20 2d 6f 20 64 61 74 |test /vz -o dat| 00000ff0 61 3d 77 72 69 74 65 62 61 63 6b 00 00 00 00 00 |a=writeback.....| 00001000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| As you can see file's page contains garbage from pagecache instead of zeros. #TEST_CASE_END Attached patch: - Add sanity check BUG_ON in order to prevent incorrect usage by caller, This is function invariant because page can has buffers and in no zero *fadata pointer at the same time. - Always attach buffers to page is it is partial write case. - Always switch back to generic_write_end if page has buffers. This is reasonable because if page already has buffer then generic_write_begin was called previously. Signed-off-by: Dmitri Monakhov <dmonakhov@openvz.org> Reviewed-by: Nick Piggin <npiggin@suse.de> Cc: <stable@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 1d4a788
Tip revision: 5b41e74ad1b0bf7bc51765ae74e5dc564afc3e48 authored by Dmitri Monakhov on 28 March 2008, 21:15:52 UTC
vfs: fix data leak in nobh_write_end()
vfs: fix data leak in nobh_write_end()
Tip revision: 5b41e74
do_mounts_md.c
#include <linux/raid/md.h>
#include "do_mounts.h"
/*
* When md (and any require personalities) are compiled into the kernel
* (not a module), arrays can be assembles are boot time using with AUTODETECT
* where specially marked partitions are registered with md_autodetect_dev(),
* and with MD_BOOT where devices to be collected are given on the boot line
* with md=.....
* The code for that is here.
*/
static int __initdata raid_noautodetect, raid_autopart;
static struct {
int minor;
int partitioned;
int level;
int chunk;
char *device_names;
} md_setup_args[256] __initdata;
static int md_setup_ents __initdata;
extern int mdp_major;
/*
* Parse the command-line parameters given our kernel, but do not
* actually try to invoke the MD device now; that is handled by
* md_setup_drive after the low-level disk drivers have initialised.
*
* 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
* assigns the task of parsing integer arguments to the
* invoked program now). Added ability to initialise all
* the MD devices (by specifying multiple "md=" lines)
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
* md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
* 2001-06-03: Dave Cinege <dcinege@psychosis.com>
* Shifted name_to_kdev_t() and related operations to md_set_drive()
* for later execution. Rewrote section to make devfs compatible.
*/
static int __init md_setup(char *str)
{
int minor, level, factor, fault, partitioned = 0;
char *pername = "";
char *str1;
int ent;
if (*str == 'd') {
partitioned = 1;
str++;
}
if (get_option(&str, &minor) != 2) { /* MD Number */
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
return 0;
}
str1 = str;
for (ent=0 ; ent< md_setup_ents ; ent++)
if (md_setup_args[ent].minor == minor &&
md_setup_args[ent].partitioned == partitioned) {
printk(KERN_WARNING "md: md=%s%d, Specified more than once. "
"Replacing previous definition.\n", partitioned?"d":"", minor);
break;
}
if (ent >= ARRAY_SIZE(md_setup_args)) {
printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor);
return 0;
}
if (ent >= md_setup_ents)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
if (level == 0 || level == LEVEL_LINEAR) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
return 0;
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
if (level == LEVEL_LINEAR)
pername = "linear";
else
pername = "raid0";
break;
}
/* FALL THROUGH */
case 1: /* the first device is numeric */
str = str1;
/* FALL THROUGH */
case 0:
md_setup_args[ent].level = LEVEL_NONE;
pername="super-block";
}
printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
minor, pername, str);
md_setup_args[ent].device_names = str;
md_setup_args[ent].partitioned = partitioned;
md_setup_args[ent].minor = minor;
return 1;
}
#define MdpMinorShift 6
static void __init md_setup_drive(void)
{
int minor, i, ent, partitioned;
dev_t dev;
dev_t devices[MD_SB_DISKS+1];
for (ent = 0; ent < md_setup_ents ; ent++) {
int fd;
int err = 0;
char *devname;
mdu_disk_info_t dinfo;
char name[16];
minor = md_setup_args[ent].minor;
partitioned = md_setup_args[ent].partitioned;
devname = md_setup_args[ent].device_names;
sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor);
if (partitioned)
dev = MKDEV(mdp_major, minor << MdpMinorShift);
else
dev = MKDEV(MD_MAJOR, minor);
create_dev(name, dev);
for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
char *p;
char comp_name[64];
u32 rdev;
p = strchr(devname, ',');
if (p)
*p++ = 0;
dev = name_to_dev_t(devname);
if (strncmp(devname, "/dev/", 5) == 0)
devname += 5;
snprintf(comp_name, 63, "/dev/%s", devname);
rdev = bstat(comp_name);
if (rdev)
dev = new_decode_dev(rdev);
if (!dev) {
printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
break;
}
devices[i] = dev;
devname = p;
}
devices[i] = 0;
if (!i)
continue;
printk(KERN_INFO "md: Loading md%s%d: %s\n",
partitioned ? "_d" : "", minor,
md_setup_args[ent].device_names);
fd = sys_open(name, 0, 0);
if (fd < 0) {
printk(KERN_ERR "md: open failed - cannot start "
"array %s\n", name);
continue;
}
if (sys_ioctl(fd, SET_ARRAY_INFO, 0) == -EBUSY) {
printk(KERN_WARNING
"md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
minor);
sys_close(fd);
continue;
}
if (md_setup_args[ent].level != LEVEL_NONE) {
/* non-persistent */
mdu_array_info_t ainfo;
ainfo.level = md_setup_args[ent].level;
ainfo.size = 0;
ainfo.nr_disks =0;
ainfo.raid_disks =0;
while (devices[ainfo.raid_disks])
ainfo.raid_disks++;
ainfo.md_minor =minor;
ainfo.not_persistent = 1;
ainfo.state = (1 << MD_SB_CLEAN);
ainfo.layout = 0;
ainfo.chunk_size = md_setup_args[ent].chunk;
err = sys_ioctl(fd, SET_ARRAY_INFO, (long)&ainfo);
for (i = 0; !err && i <= MD_SB_DISKS; i++) {
dev = devices[i];
if (!dev)
break;
dinfo.number = i;
dinfo.raid_disk = i;
dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
dinfo.major = MAJOR(dev);
dinfo.minor = MINOR(dev);
err = sys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo);
}
} else {
/* persistent */
for (i = 0; i <= MD_SB_DISKS; i++) {
dev = devices[i];
if (!dev)
break;
dinfo.major = MAJOR(dev);
dinfo.minor = MINOR(dev);
sys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo);
}
}
if (!err)
err = sys_ioctl(fd, RUN_ARRAY, 0);
if (err)
printk(KERN_WARNING "md: starting md%d failed\n", minor);
else {
/* reread the partition table.
* I (neilb) and not sure why this is needed, but I cannot
* boot a kernel with devfs compiled in from partitioned md
* array without it
*/
sys_close(fd);
fd = sys_open(name, 0, 0);
sys_ioctl(fd, BLKRRPART, 0);
}
sys_close(fd);
}
}
static int __init raid_setup(char *str)
{
int len, pos;
len = strlen(str) + 1;
pos = 0;
while (pos < len) {
char *comma = strchr(str+pos, ',');
int wlen;
if (comma)
wlen = (comma-str)-pos;
else wlen = (len-1)-pos;
if (!strncmp(str, "noautodetect", wlen))
raid_noautodetect = 1;
if (strncmp(str, "partitionable", wlen)==0)
raid_autopart = 1;
if (strncmp(str, "part", wlen)==0)
raid_autopart = 1;
pos += wlen+1;
}
return 1;
}
__setup("raid=", raid_setup);
__setup("md=", md_setup);
void __init md_run_setup(void)
{
create_dev("/dev/md0", MKDEV(MD_MAJOR, 0));
if (raid_noautodetect)
printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
else {
int fd = sys_open("/dev/md0", 0, 0);
if (fd >= 0) {
sys_ioctl(fd, RAID_AUTORUN, raid_autopart);
sys_close(fd);
}
}
md_setup_drive();
}
Computing file changes ...