Staging
v0.8.1
https://github.com/torvalds/linux
Revision b29c701deacd5d24453127c37ed77ef851c53b8b authored by Henry Nestler on 12 May 2008, 13:44:39 UTC, committed by Ingo Molnar on 12 June 2008, 19:26:07 UTC
Page faults in kernel address space between PAGE_OFFSET up to
VMALLOC_START should not try to map as vmalloc.

Fix rarely endless page faults inside mount_block_root for root
filesystem at boot time.

All 32bit kernels up to 2.6.25 can fail into this hole.
I can not present this under native linux kernel. I see, that the 64bit
has fixed the problem. I copied the same lines into 32bit part.

Recorded debugs are from coLinux kernel 2.6.22.18 (virtualisation):
http://www.henrynestler.com/colinux/testing/pfn-check-0.7.3/20080410-antinx/bug16-recursive-page-fault-endless.txt
The physicaly memory was trimmed down to 192MB to better catch the bug.
More memory gets the bug more rarely.

Details, how every x86 32bit system can fail:

Start from "mount_block_root",
http://lxr.linux.no/linux/init/do_mounts.c#L297
There the variable "fs_names" got one memory page with 4096 bytes.
Variable "p" walks through the existing file system types. The first
string is no problem.
But, with the second loop in mount_block_root the offset of "p" is not
at beginning of page, the offset is for example +9, if "reiserfs" is the
first in list.
Than calls do_mount_root, and lands in sys_mount.
Remember: Variable "type_page" contains now "fs_type+9" and not contains
a full page.
The sys_mount copies 4096 bytes with function "exact_copy_from_user()":
http://lxr.linux.no/linux/fs/namespace.c#L1540

Mostly exist pages after the buffer "fs_names+4096+9" and the page fault
handler was not called. No problem.

In the case, if the page after "fs_names+4096" is not mapped, the page
fault handler was called from http://lxr.linux.no/linux/fs/namespace.c#L1320

The do_page_fault gots an address 0xc03b4000.
It's kernel address, address >= TASK_SIZE, but not from vmalloc! It's
from "__getname()" alias "kmem_cache_alloc".
The "error_code" is 0. "vmalloc_fault" will be call:
http://lxr.linux.no/linux/arch/i386/mm/fault.c#L332

"vmalloc_fault" tryed to find the physical page for a non existing
virtual memory area. The macro "pte_present" in vmalloc_fault()
got a next page fault for 0xc0000ed0 at:
http://lxr.linux.no/linux/arch/i386/mm/fault.c#L282

No PTE exist for such virtual address. The page fault handler was trying
to sync the physical page for the PTE lockup.

This called vmalloc_fault() again for address 0xc000000, and that also
was not existing. The endless began...

In normal case the cpu would still loop with disabled interrrupts. Under
coLinux this was catched by a stack overflow inside printk debugs.

Signed-off-by: Henry Nestler <henry.nestler@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
1 parent 3703f39
Raw File
Tip revision: b29c701deacd5d24453127c37ed77ef851c53b8b authored by Henry Nestler on 12 May 2008, 13:44:39 UTC
x86: fix endless page faults in mount_block_root for Linux 2.6
Tip revision: b29c701
blk-ioc.c
/*
 * Functions related to io context handling
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */

#include "blk.h"

/*
 * For io context allocations
 */
static struct kmem_cache *iocontext_cachep;

static void cfq_dtor(struct io_context *ioc)
{
	if (!hlist_empty(&ioc->cic_list)) {
		struct cfq_io_context *cic;

		cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
								cic_list);
		cic->dtor(ioc);
	}
}

/*
 * IO Context helper functions. put_io_context() returns 1 if there are no
 * more users of this io context, 0 otherwise.
 */
int put_io_context(struct io_context *ioc)
{
	if (ioc == NULL)
		return 1;

	BUG_ON(atomic_read(&ioc->refcount) == 0);

	if (atomic_dec_and_test(&ioc->refcount)) {
		rcu_read_lock();
		if (ioc->aic && ioc->aic->dtor)
			ioc->aic->dtor(ioc->aic);
		cfq_dtor(ioc);
		rcu_read_unlock();

		kmem_cache_free(iocontext_cachep, ioc);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(put_io_context);

static void cfq_exit(struct io_context *ioc)
{
	rcu_read_lock();

	if (!hlist_empty(&ioc->cic_list)) {
		struct cfq_io_context *cic;

		cic = list_entry(ioc->cic_list.first, struct cfq_io_context,
								cic_list);
		cic->exit(ioc);
	}
	rcu_read_unlock();
}

/* Called by the exitting task */
void exit_io_context(void)
{
	struct io_context *ioc;

	task_lock(current);
	ioc = current->io_context;
	current->io_context = NULL;
	task_unlock(current);

	if (atomic_dec_and_test(&ioc->nr_tasks)) {
		if (ioc->aic && ioc->aic->exit)
			ioc->aic->exit(ioc->aic);
		cfq_exit(ioc);

		put_io_context(ioc);
	}
}

struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
	struct io_context *ret;

	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
	if (ret) {
		atomic_set(&ret->refcount, 1);
		atomic_set(&ret->nr_tasks, 1);
		spin_lock_init(&ret->lock);
		ret->ioprio_changed = 0;
		ret->ioprio = 0;
		ret->last_waited = jiffies; /* doesn't matter... */
		ret->nr_batch_requests = 0; /* because this is 0 */
		ret->aic = NULL;
		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
		INIT_HLIST_HEAD(&ret->cic_list);
		ret->ioc_data = NULL;
	}

	return ret;
}

/*
 * If the current task has no IO context then create one and initialise it.
 * Otherwise, return its existing IO context.
 *
 * This returned IO context doesn't have a specifically elevated refcount,
 * but since the current task itself holds a reference, the context can be
 * used in general code, so long as it stays within `current` context.
 */
struct io_context *current_io_context(gfp_t gfp_flags, int node)
{
	struct task_struct *tsk = current;
	struct io_context *ret;

	ret = tsk->io_context;
	if (likely(ret))
		return ret;

	ret = alloc_io_context(gfp_flags, node);
	if (ret) {
		/* make sure set_task_ioprio() sees the settings above */
		smp_wmb();
		tsk->io_context = ret;
	}

	return ret;
}

/*
 * If the current task has no IO context then create one and initialise it.
 * If it does have a context, take a ref on it.
 *
 * This is always called in the context of the task which submitted the I/O.
 */
struct io_context *get_io_context(gfp_t gfp_flags, int node)
{
	struct io_context *ret = NULL;

	/*
	 * Check for unlikely race with exiting task. ioc ref count is
	 * zero when ioc is being detached.
	 */
	do {
		ret = current_io_context(gfp_flags, node);
		if (unlikely(!ret))
			break;
	} while (!atomic_inc_not_zero(&ret->refcount));

	return ret;
}
EXPORT_SYMBOL(get_io_context);

void copy_io_context(struct io_context **pdst, struct io_context **psrc)
{
	struct io_context *src = *psrc;
	struct io_context *dst = *pdst;

	if (src) {
		BUG_ON(atomic_read(&src->refcount) == 0);
		atomic_inc(&src->refcount);
		put_io_context(dst);
		*pdst = src;
	}
}
EXPORT_SYMBOL(copy_io_context);

static int __init blk_ioc_init(void)
{
	iocontext_cachep = kmem_cache_create("blkdev_ioc",
			sizeof(struct io_context), 0, SLAB_PANIC, NULL);
	return 0;
}
subsys_initcall(blk_ioc_init);
back to top