diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Tue Oct 10 07:00:16 2006 -0700 @@ -568,7 +568,8 @@ void __init early_cpu_init(void) #endif } -void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr) +/* cannot be __cpuinit when used by self checkpointer */ +void cpu_gdt_init(struct Xgt_desc_struct *gdt_descr) { unsigned long frames[16]; unsigned long va; @@ -584,6 +585,42 @@ void __cpuinit cpu_gdt_init(struct Xgt_d if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) BUG(); } + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +#include + +void cpu_reinit(void) +{ + struct task_struct* tsk = current; + struct thread_struct *thread = &tsk->thread; + int cpu = smp_processor_id(); + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + cpu_gdt_init(cpu_gdt_descr); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0); + + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); + + + load_LDT(&current->active_mm->context); + + /* TODO set FPU stuff */ +} +#endif + /* * cpu_init() initializes state that is per-CPU. 
Some data is already diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S --- a/linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S Tue Oct 10 07:00:16 2006 -0700 @@ -859,6 +859,12 @@ KPROBE_ENTRY(page_fault) jmp error_code .previous .text +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +ENTRY(checkpoint_page_fault) + pushl $do_checkpoint_page_fault + jmp error_code +#endif + #ifdef CONFIG_X86_MCE ENTRY(machine_check) pushl $0 diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c Tue Oct 10 07:00:16 2006 -0700 @@ -541,6 +541,16 @@ struct task_struct fastcall * __switch_t else BUG_ON(!(read_cr0() & 8)); #endif +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + /* if self-migrating, we need to touch the new esp0 to make sure it will be + * writable by Xen */ + + if(next->esp0) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(next->esp0)); + } +#endif + /* * Reload esp0. * This is load_esp0(tss, next) with a multicall. 
diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c Tue Oct 10 07:00:16 2006 -0700 @@ -1859,6 +1859,7 @@ void __init setup_arch(char **cmdline_p) panic("Xen granted us console access " "but not privileged status"); + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) if (!efi_enabled || diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c Tue Oct 10 07:00:16 2006 -0700 @@ -1043,6 +1043,19 @@ static trap_info_t trap_table[] = { { 0, 0, 0, 0 } }; +extern void hypervisor_callback(void); +extern void failsafe_callback(void); + +extern void cpu_reinit(void); + +void xen_trap_init(void) +{ + HYPERVISOR_set_callbacks( + __KERNEL_CS, (unsigned long)hypervisor_callback, + __KERNEL_CS, (unsigned long)failsafe_callback); + HYPERVISOR_set_trap_table(trap_table); +} + void __init trap_init(void) { HYPERVISOR_set_trap_table(trap_table); diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c --- a/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c Tue Oct 10 07:00:16 2006 -0700 @@ -44,6 +44,9 @@ #include #include +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +#include +#endif #ifdef CONFIG_X86_64 #define pmd_val_ma(v) (v).pmd #else @@ -58,14 +61,26 @@ void xen_l1_entry_update(pte_t *ptr, pte void xen_l1_entry_update(pte_t *ptr, pte_t val) { mmu_update_t u; +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + BUG(); /* trivial to fix, just make sure this is not called */ +#endif u.ptr = virt_to_machine(ptr); u.val = pte_val_ma(val); BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } +extern void perhaps_flush_wrpt(void); void xen_l2_entry_update(pmd_t *ptr, pmd_t val) { mmu_update_t 
u; +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(unlikely(is_migrating())) + { + perhaps_flush_wrpt(); + extern void set_dirty_checkpoint_pfn(unsigned long); + set_dirty_checkpoint_pfn(__pa(ptr)>>PAGE_SHIFT); + } +#endif u.ptr = virt_to_machine(ptr); u.val = pmd_val_ma(val); BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); @@ -135,6 +150,7 @@ void xen_tlb_flush(void) } EXPORT_SYMBOL(xen_tlb_flush); +//EXPORT_SYMBOL(xen_invlpg); void xen_invlpg(unsigned long ptr) { struct mmuext_op op; diff -r 1e225598ce82 linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Tue Oct 10 07:00:16 2006 -0700 @@ -29,7 +29,18 @@ #include #include + static void pgd_test_and_unpin(pgd_t *pgd); + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +#include +void do_set_pte(pte_t* pteptr, pte_t pteval) +{ + *pteptr = pteval; +} + +void (*set_pte) (pte_t* pteptr, pte_t pteval) = do_set_pte; +#endif void show_mem(void) { diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/Kconfig --- a/linux-2.6-xen-sparse/drivers/xen/Kconfig Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig Tue Oct 10 07:00:16 2006 -0700 @@ -44,6 +44,13 @@ config XEN_PCIDEV_BACKEND PCI devices to other guests. If you select this to be a module, you will need to make sure no other driver has bound to the device(s) you want to make visible to other guests. + +config XEN_SELF_CHECKPOINTING + bool "Support /dev/checkpoint for self checkpoint or migration" + default y + help + This gives a user-space process the ability to write a checkpoint + of the whole system. 
choice prompt "PCI Backend Mode" diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/Makefile Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/Makefile Tue Oct 10 07:00:16 2006 -0700 @@ -20,3 +20,4 @@ obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pcib obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/ obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront/ +obj-$(CONFIG_XEN_SELF_CHECKPOINTING)+= migrate/ diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Tue Oct 10 07:00:16 2006 -0700 @@ -803,6 +803,10 @@ static void blkif_recover(struct blkfron /* ** Driver Registration ** */ +static int blkfront_reset(struct xenbus_device *dev) +{ + return 0; +} static struct xenbus_device_id blkfront_ids[] = { @@ -819,6 +823,7 @@ static struct xenbus_driver blkfront = { .remove = blkfront_remove, .resume = blkfront_resume, .otherend_changed = backend_changed, + .reset = blkfront_reset, }; diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/core/gnttab.c --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c Tue Oct 10 07:00:16 2006 -0700 @@ -42,6 +42,10 @@ #include #include #include + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +#include +#endif #if 1 #define ASSERT(_p) \ @@ -191,6 +195,10 @@ gnttab_query_foreign_access(grant_ref_t return (nflags & (GTF_reading|GTF_writing)); } +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +extern void set_dirty_checkpoint_mfn(unsigned long); +#endif + int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { @@ -204,6 +212,13 @@ gnttab_end_foreign_access_ref(grant_ref_ } } while ((nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != flags); + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(unlikely(is_migrating())) + { + 
set_dirty_checkpoint_mfn(shared[ref].frame); + } +#endif return 1; } diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Tue Oct 10 07:00:16 2006 -0700 @@ -63,6 +63,10 @@ #include #include +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +#include +#endif + #define GRANT_INVALID_REF 0 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) @@ -455,6 +459,22 @@ static int network_open(struct net_devic return 0; } + +/* actually, I would like to do this for all pages getting freed in the system, +* * as there is no reason for checkpointing pages that no one references +* */ + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +static void destructor(struct sk_buff* skb) +{ + if(is_migrating()==1) + { + unsigned long pfn = virt_to_phys(skb->head)>>PAGE_SHIFT; + clear_bit(pfn, migration_page->dirty_bitmap); + } +} +#endif + static void network_tx_buf_gc(struct net_device *dev) { @@ -486,6 +506,11 @@ static void network_tx_buf_gc(struct net &np->gref_tx_head, np->grant_tx_ref[id]); np->grant_tx_ref[id] = GRANT_INVALID_REF; add_id_to_freelist(np->tx_skbs, id); + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(!skb->destructor) skb->destructor=destructor; +#endif + dev_kfree_skb_irq(skb); } @@ -595,6 +620,15 @@ static void network_alloc_rx_buffers(str /* Remove this page before passing back to Xen. 
*/ set_phys_to_machine(__pa(skb->head) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(unlikely(is_migrating())) + { + extern void set_dirty_va_mapping(unsigned long); + set_dirty_va_mapping( (unsigned long) skb->head ); + } +#endif + MULTI_update_va_mapping(np->rx_mcl+i, (unsigned long)skb->head, __pte(0), 0); @@ -846,6 +880,16 @@ static int netif_poll(struct net_device __skb_queue_tail(&rxq, skb); } + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(unlikely(is_migrating())) + { + extern void set_dirty_va_mapping(unsigned long); + extern void set_dirty_checkpoint_pfn(unsigned long); + set_dirty_checkpoint_pfn( __pa(skb->head)>>PAGE_SHIFT ); + set_dirty_va_mapping( (unsigned long) skb->head ); + } +#endif /* Some pages are no longer absent... */ balloon_update_driver_allowance(-work_done); @@ -1311,6 +1355,11 @@ static struct xenbus_device_id netfront_ { "vif" }, { "" } }; +static int netfront_reset(struct xenbus_device *dev) +{ + return 0; +} + static struct xenbus_driver netfront = { @@ -1321,6 +1370,7 @@ static struct xenbus_driver netfront = { .remove = netfront_remove, .resume = netfront_resume, .otherend_changed = backend_changed, + .reset = netfront_reset, }; diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Tue Oct 10 07:00:16 2006 -0700 @@ -186,6 +186,11 @@ int xb_read(void *data, unsigned len) return 0; } +void xb_reset(void) +{ + xenbus_irq = 0; +} + /* Set up interrupt handler off store event channel. 
*/ int xb_init_comms(void) { diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Tue Oct 10 07:00:16 2006 -0700 @@ -776,6 +776,23 @@ static struct xenbus_watch be_watch = { .callback = backend_changed, }; +static int reset_dev(struct device *dev, void *data) +{ + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev; + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + xdev = container_of(dev, struct xenbus_device, dev); + if (drv->reset) + err = drv->reset(xdev); + if (err) + printk("xenbus: reset %s failed: %i\n", dev->bus_id, err); + return 0; +} + static int suspend_dev(struct device *dev, void *data) { int err = 0; @@ -833,6 +850,14 @@ static int resume_dev(struct device *dev printk(KERN_WARNING "xenbus: resume %s failed: %i\n", dev->bus_id, err); return err; +} + +void xenbus_reset(void) +{ + bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, reset_dev); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, reset_dev); + xb_reset(); + xs_reset(); } void xenbus_suspend(void) diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Tue Oct 10 07:00:16 2006 -0700 @@ -602,8 +602,7 @@ int register_xenbus_watch(struct xenbus_ down_read(&xs_state.suspend_mutex); spin_lock(&watches_lock); - BUG_ON(find_watch(token)); - list_add(&watch->list, &watches); + if(!find_watch(token)) list_add(&watch->list, &watches); spin_unlock(&watches_lock); err = xs_watch(watch->node, token); @@ -662,6 +661,13 @@ void unregister_xenbus_watch(struct xenb } } EXPORT_SYMBOL_GPL(unregister_xenbus_watch); + +void xs_reset(void) +{ + mutex_init(&xs_state.request_mutex); + init_rwsem(&xs_state.suspend_mutex); + 
xs_suspend(); +} void xs_suspend(void) { diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/fixmap.h Tue Oct 10 07:00:16 2006 -0700 @@ -84,6 +84,8 @@ enum fixed_addresses { FIX_PCIE_MCFG, #endif FIX_SHARED_INFO, + FIX_CHECKPOINT_TMP, + FIX_CHECKPOINT_TMP2, #define NR_FIX_ISAMAPS 256 FIX_ISAMAP_END, FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/hypercall.h Tue Oct 10 07:00:16 2006 -0700 @@ -40,6 +40,13 @@ #define __STR(x) #x #define STR(x) __STR(x) +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +extern void update_va_prefix(unsigned long va, pte_t* pteval); +#else +#define update_va_prefix(a,b) +#endif + + #define _hypercall0(type, name) \ ({ \ long __res; \ @@ -236,6 +243,8 @@ HYPERVISOR_update_va_mapping( unsigned long va, pte_t new_val, unsigned long flags) { unsigned long pte_hi = 0; + update_va_prefix(va,&new_val); + #ifdef CONFIG_X86_PAE pte_hi = new_val.pte_high; #endif diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Tue Oct 10 07:00:16 2006 -0700 @@ -21,12 +21,32 @@ static inline void enter_lazy_tlb(struct if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; #endif +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + // why?? 
checkpoint_unmap_mm(mm); + //checkpoint_unmap_mm(mm); + if(tsk) + { if(tsk->thread.esp) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(tsk->thread.esp)); + } + + if(tsk->thread.esp0) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(tsk->thread.esp0)); + }} +#endif } -#define prepare_arch_switch(next) __prepare_arch_switch() +extern void checkpoint_switch_mm(struct mm_struct*,struct mm_struct*); +#define prepare_arch_switch(next) __prepare_arch_switch(next) +//#define prepare_arch_switch(next) __prepare_arch_switch(next) -static inline void __prepare_arch_switch(void) +static inline void __prepare_arch_switch(task_t* next) { + +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +// if(next && next->mm) checkpoint_switch_mm(NULL,next->mm); +#endif /* * Save away %fs and %gs. No need to save %es and %ds, as those * are always kernel segments while inside the kernel. Must @@ -61,6 +81,18 @@ static inline void switch_mm(struct mm_s #endif cpu_set(cpu, next->cpu_vm_mask); +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(tsk) { + if(tsk->thread.esp) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(tsk->thread.esp)); + } + + if(tsk->thread.esp0) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(tsk->thread.esp0)); + }} +#endif /* Re-load page tables: load_cr3(next->pgd) */ op->cmd = MMUEXT_NEW_BASEPTR; op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/page.h Tue Oct 10 07:00:16 2006 -0700 @@ -146,12 +146,19 @@ static inline unsigned long mfn_to_local return pfn; } +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +extern void set_p2m_prefix(unsigned long,unsigned long); +#endif + static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (xen_feature(XENFEAT_auto_translated_physmap)) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); 
return; } +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + set_p2m_prefix(pfn,mfn); +#endif phys_to_machine_mapping[pfn] = mfn; } diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h Tue Oct 10 07:00:16 2006 -0700 @@ -13,7 +13,11 @@ * within a page table are directly modified. Thus, the following * hook is made available. */ +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +extern void (*set_pte) (pte_t* pteptr, pte_t pteval); +#else #define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#endif #define set_pte_at(_mm,addr,ptep,pteval) do { \ if (((_mm) != current->mm && (_mm) != &init_mm) || \ diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h Tue Oct 10 07:00:16 2006 -0700 @@ -118,7 +118,7 @@ void paging_init(void); #define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. 
*/ #define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ #define _PAGE_UNUSED1 0x200 /* available for programmer */ -#define _PAGE_UNUSED2 0x400 +#define _PAGE_CHKPOINT 0x400 #define _PAGE_UNUSED3 0x800 /* If _PAGE_PRESENT is clear, we use these: */ @@ -225,7 +225,13 @@ static inline int pte_read(pte_t pte) { static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } +#ifdef CONFIG_XEN_SELF_CHECKPOINTING +static inline int pte_write(pte_t pte) { return (pte).pte_low & (_PAGE_RW|_PAGE_CHKPOINT); } +static inline int pte_chkpoint(pte_t pte) { return (pte).pte_low & _PAGE_CHKPOINT; } +#else static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } +#endif + static inline int pte_huge(pte_t pte) { return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; } /* diff -r 1e225598ce82 linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h Thu Jun 29 10:51:49 2006 +0100 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h Tue Oct 10 07:00:16 2006 -0700 @@ -497,6 +497,20 @@ static inline void __load_esp0(struct ts static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; +#ifdef CONFIG_XEN_SELF_CHECKPOINTING + if(thread) + { + if(thread->esp) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(thread->esp)); + } + + if(thread->esp0) + { + asm volatile("addl $0x0, -4(%%eax)" : : "eax"(thread->esp0)); + } + } +#endif #ifdef CONFIG_X86_SYSENTER /* This can only happen when SEP is enabled, no need to test "SEP"arately */ if (unlikely(tss->ss1 != thread->sysenter_cs)) { diff -r 1e225598ce82 linux-2.6-xen-sparse/include/xen/xenbus.h --- a/linux-2.6-xen-sparse/include/xen/xenbus.h Thu Jun 29 10:51:49 2006 +0100 +++ 
b/linux-2.6-xen-sparse/include/xen/xenbus.h Tue Oct 10 07:00:16 2006 -0700 @@ -102,6 +102,7 @@ struct xenbus_driver { int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); + int (*reset)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; int (*read_otherend_details)(struct xenbus_device *dev); diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/migrate/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/drivers/xen/migrate/Makefile Tue Oct 10 07:00:16 2006 -0700 @@ -0,0 +1,4 @@ + +obj-y += migrate.o + + diff -r 1e225598ce82 linux-2.6-xen-sparse/drivers/xen/migrate/migrate.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/drivers/xen/migrate/migrate.c Tue Oct 10 07:00:16 2006 -0700 @@ -0,0 +1,1702 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) || \ +defined(CONFIG_PREEMPT_VOLUNTARY) || defined(CONFIG_HIGHMEM4G) || \ +defined(CONFIG_HIGHMEM64G) +#error "self checkpointing does not support SMP, PREEMPT, or HIGHMEM :-( yet" +#endif + +#define BITMAP_SIZE ((max_low_pfn+32767)>>(3+12)) +#define BITMAP_SIZE_BYTES (BITMAP_SIZE*PAGE_SIZE) + +#define xxprintk(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) + +struct mig_page* migration_page = 0; +#define MAX_COW_PAGES 1024 +unsigned long* cow_pages; +unsigned long* cow_pfns; +int num_cow_pages; + +typedef unsigned long mfn_t; +typedef unsigned long pfn_t; + +struct dict_entry { mfn_t mfn; pfn_t pfn; }; +struct dict_entry* physmap_copy; + +static void __unmap_mm(struct mm_struct*); +static void unmap_mm(struct mm_struct*); +static void __unmap_kernel(void); +static void unmap_kernel(void); + +void set_dirty_checkpoint_pfn(unsigned 
long); +void set_dirty_checkpoint_mfn(unsigned long); +void checkpoint_unmap_mm(struct mm_struct*); +void perhaps_flush_wrpt(void); + +static void* no_unmap_bitmap = NULL; +static void* unmapped_pgds_bitmap; +void* dirty_bitmap0; +void* dirty_bitmap1; +void* cow_bitmap; +void* l1_bitmap; +void* l2_bitmap; + +void** ptrlist[] = { + &no_unmap_bitmap, /* this one must be first */ + &dirty_bitmap0, + &dirty_bitmap1, + &cow_bitmap, + &l1_bitmap, + &l2_bitmap, + &unmapped_pgds_bitmap }; + +unsigned long cur_pgd_phys; + + +//#define SELFMIG_CHECKPOINT_CHECK + +DECLARE_WAIT_QUEUE_HEAD(waiting_for_checkpoint); + +static inline int +checkpoint_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; + +flags = UVMF_TLB_FLUSH | UVMF_ALL; + +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + +#ifdef SELFMIG_CHECKPOINT_CHECK + +static void ccheck_init(void) +{ + int rc; + int num_pages = 2*max_low_pfn; + xen_memory_reservation_t mem; + int i; + + migration_page->mfn_list = (unsigned long *)vmalloc(num_pages * sizeof(unsigned long)); + memset(&mem,0,sizeof(mem)); + + mem.extent_start = migration_page->mfn_list; + mem.nr_extents = num_pages; + mem.domid = DOMID_SELF; + + rc = HYPERVISOR_memory_op( XENMEM_increase_reservation, &mem); + + if(!rc) + { + printk("not enough extra xen-memory for consistency check!\n"); + return; + } + + /* clear all pages first */ + for(i=0; imfn_list[i]<mfn_list[2*i+1]<mfn_list[2*pfn]<mfn_list[2*i]<mfn_list[2*i+1]<>PAGE_SHIFT)); + + } + + } + if(diffs) + { + + printk("==> inconsistent page #%x, %d diffs!",i,diffs); + if(test_bit(i,cow_bitmap)) + { + printk(" (was in cow) "); + } + if(!memcmp(empty_zero_page,a,PAGE_SIZE)) + { + printk(" (a is zero) "); + } + if(!memcmp(empty_zero_page,b,PAGE_SIZE)) + { + printk(" (b is zero) "); + } + printk("\n\n"); + + } + } + + } + } + + printk("verification done\n"); +} + 
+#else +#define ccheck_init() +#define ccheck_backup_system_pages() +#define ccheck_remember_checkpointed_page(a,b) +#define ccheck_verify() +#endif + +void perhaps_flush_wrpt(void) +{ + if(migration_page->wrpt_need_flush) + { + mmu_update_t u; + u.ptr = 0; + u.val = 0; + BUG_ON(HYPERVISOR_mmu_update(&u, 0, NULL, DOMID_SELF) < 0); + + migration_page->wrpt_need_flush=0; + } +} +#if 0 +void checkpoint_switch_mm(struct mm_struct* prev,struct mm_struct* next) +{ + if(is_migrating()) + { + //printk("unmap %p\n",next); + checkpoint_unmap_mm(next); + } +} +#endif + +void set_p2m_prefix(pfn_t pfn,mfn_t mfn) +{ + if(unlikely(is_migrating()==1)) + { + physmap_copy[pfn].pfn=pfn; + physmap_copy[pfn].mfn=mfn; + } +} + +static inline pfn_t oldmfn_to_pfn(mfn_t mfn) +{ + struct dict_entry* first = physmap_copy; + struct dict_entry* middle; + int len = max_low_pfn; + int half; + + while (len > 0) + { + half = len >> 1; + middle = first + half; + + if (middle->mfn < mfn) + { + first = middle; + ++first; + len = len - half - 1; + } + else len = half; + } + + if(first->mfn != mfn) + { + xxprintk("could not resolve mfn: "); + print_long(mfn); + return ~0UL; + } + + return first->pfn; + +} + +void remap_pgd(pfn_t pgd_pfn) +{ + int i; + unsigned long* pdes; + + if(pgd_pfn >= max_low_pfn) + { + + xxprintk("pgd_pfn out of range!\n"); + for(;;); + + } + + if(test_and_set_bit(pgd_pfn,l2_bitmap)) + return; + + pdes = (unsigned long*) __va( pgd_pfn <>PGDIR_SHIFT; i++) + { + unsigned long pde = pdes[i]; + if(pde & _PAGE_PRESENT) + { + unsigned long* ptes; + unsigned long pfn; + pte_t pte; + int j; + + unsigned long mfn = pde>>PAGE_SHIFT; + + if(mfn==INVALID_P2M_ENTRY) continue; + + pfn = oldmfn_to_pfn(mfn); + + if(pfn == ~0UL) continue; + + if(pfn>max_low_pfn) + { + xxprintk("BAD pfn out of range!\n"); + continue; + } + + pdes[i] = ( pfn_to_mfn(pfn) <>PAGE_SHIFT; + + if(mfn==INVALID_P2M_ENTRY) continue; + + pfn = oldmfn_to_pfn(mfn); + //xprintk("%d -> %d\n",mfn,*pmfn); + + if(pfn != ~0UL) 
ptes[j] = + (pfn_to_mfn(pfn) <>9); + else + { + //xprintk("(%08lx) %d -> ??!\n",i<mfn - pb->mfn; +} + +void phys_swap(void* a,void* b, int size) +{ + struct dict_entry* pa = (struct dict_entry*)a; + struct dict_entry* pb = (struct dict_entry*)b; + + struct dict_entry tmp = *pa; + *pa = *pb; + *pb = tmp; +} + +#define read_cr3_machine() ({ \ + unsigned int __dummy; \ + __asm__ ( \ + "movl %%cr3,%0\n\t" \ + :"=r" (__dummy)); \ + (__dummy); \ +}) + +void recover(void) +{ + int i,j; + struct task_struct* p; + unsigned long cstrap_cr3; + struct mmuext_op op; + struct page* page; + pte_t pte; + + xxprintk("Recover\n"); + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + xen_trap_init(); + + set_pte = do_set_pte; + + cstrap_cr3 = read_cr3_machine(); + print_long(cstrap_cr3); + + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + } + + /* here we remap page tables for all processes, taking care to write-protect + * or unprotect virtual mappings for pgds according to the context.pinned flag. + * + * at the time of the checkpoint, we set up a table which maps old + * mfns to phys-map indices, which saves us the canonicalize step that we + * used to have, and the Xen impl. still uses afaik. 
+ */ + + sort(physmap_copy,max_low_pfn,sizeof(struct dict_entry), phys_cmp,phys_swap); + + memset(l1_bitmap,0,BITMAP_SIZE_BYTES); + memset(l2_bitmap,0,BITMAP_SIZE_BYTES); + + for_each_process(p) + { + if(p->mm) + { + struct mm_struct* mm = p->mm; + unsigned long va = (unsigned long) mm->pgd; + + if(va & 0xfff) + { + xxprintk("foobar pdes!\n"); + print_long(va); + for(;;); + } + + remap_pgd( __pa(va) >> PAGE_SHIFT ); + if (test_bit(PG_pinned, &virt_to_page(p->mm->pgd)->flags)) + { + checkpoint_update_va_mapping( + va, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH); + + xen_pgd_pin(__pa(mm->pgd)); + } + } + } + remap_pgd(virt_to_phys(init_mm.pgd) >> PAGE_SHIFT ); + + /* this is quite strange -- page_to_phys seems to give an mfn rather + * than a pfn, so we have to convert it + */ + + for(page = pgd_list; page->index; page=(struct page *)page->index) + { + remap_pgd(page_to_pfn(page)); + } + + xxprintk("start info "); + print_long(start_info_address); + memcpy(xen_start_info, (void*) start_info_address, sizeof(start_info_t)); + + /* cur_pgd_phys is remembered in the checkpoint */ + xen_pt_switch(cur_pgd_phys); + + pte.pte_low = xen_start_info->console_mfn<console_mfn, old_console_pfn); + + pte.pte_low = xen_start_info->store_mfn<store_mfn, old_store_pfn); + + + /* we no longer need the tmp pgd set up by cstrap, unpin it */ + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = cstrap_cr3>>PAGE_SHIFT; + if(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + xxprintk("unpin failed!\n"); +#if 0 + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +#endif + + set_migrating(0); + + + HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; + clear_fixmap(FIX_SHARED_INFO); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + + //xen_start_info->shared_info = xen_start_info->shared_info; + 
set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + memset(empty_zero_page, 0, PAGE_SIZE); + + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(pfn_to_mfn_frame_list); + + cpu_reinit(); + + migration_page=0; + free_checkpoint_pages(); + + gnttab_suspend(); + gnttab_resume(); + irq_resume(); + time_resume(); + + kmem_cache_shrink(pgd_cache); + + print_long(resume_stack); + + asm volatile ( + "movl resume_stack,%esp\n" \ + "popl %gs\n" \ + "popl %fs\n" \ + "popl %es\n" \ + "popl %ds\n" \ + + "popf\n" \ + "popa\n" \ + "ret\n" \ + ); + + +} + + +unsigned long migration_dst_ip; +unsigned short migration_src_port; +unsigned short migration_dst_port; + +void do_checkpoint_set_pte(pte_t* pteptr, pte_t pteval) +{ + set_dirty_checkpoint_pfn(__pa(pteptr)>>PAGE_SHIFT); + migration_page->wrpt_need_flush=1; + *pteptr = pte_mkchkpoint(pteval); +} + +extern void checkpoint_page_fault(void); +extern fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code); + +fastcall void do_checkpoint_page_fault(struct pt_regs *regs, unsigned long + error_code) +{ + unsigned long address; + address = read_cr2(); + + if(migration_page->nofaults) {xxprintk("pf: "); + print_long(address); + print_long(regs->eip);} + + migration_page->nofaults++; + + if((error_code & 3)==3) + { + pte_t* pte; + unsigned long flags; + unsigned long o; + + + if(address>=TASK_SIZE) + { + pte = lookup_address(address); + o = pte->pte_low; + } + else + { + pgd_t* cur_pgd = (pgd_t*) __va(read_cr3()); + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int index = pgd_index(address); + + pgd = index + cur_pgd; + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + pte = pte_offset_map(pmd, address); + o = pte->pte_low; + + /* it is safe to unmap the pte, because we know from the error_code + * that it is actually present, and because all we use it for below is + * to mark the pfn dirty if the pte-dirty bit is about to be changed */ + + pte_unmap(pte); + } + + + 
if((o&(_PAGE_CHKPOINT|_PAGE_PRESENT))==(_PAGE_CHKPOINT|_PAGE_PRESENT)) + { + pte_t new_pte; + + if(is_migrating()) + { + unsigned long pfn; + + spin_lock_irqsave(&migration_page->lock,flags); + + pfn = mfn_to_pfn(o>>PAGE_SHIFT); + + if(pfn>PAGE_SHIFT); + } + + spin_unlock_irqrestore(&migration_page->lock,flags); + } + + new_pte.pte_low = _PAGE_RW | (o&(~_PAGE_CHKPOINT)); + checkpoint_update_va_mapping(address,new_pte,0); + + + migration_page->nofaults--; + +#if 1 + + /* "stungun" hack to slow down faulting processes in final state of + * migration + */ + + if ( is_migrating()==1 && migration_page->credit < 0 && + migration_page->num_deltas>0) + { + struct task_struct* tsk = current; + + if(tsk && tsk->tgid != migration_page->migrator->tgid && + addressmm ) + { + DECLARE_WAITQUEUE(wait,current); + xxprintk("stun.\n"); + + add_wait_queue(&waiting_for_checkpoint,&wait); + set_current_state(TASK_UNINTERRUPTIBLE); + while(is_migrating() && migration_page->credit < 0) schedule(); + set_current_state(TASK_RUNNING); + xxprintk("unstun.\n"); + } + } +#endif + + return; + } + } + + migration_page->nofaults--; + do_page_fault(regs,error_code); +} + +EXPORT_SYMBOL(do_checkpoint_page_fault); + +static inline void* get_checkpoint_page(void) +{ + unsigned long va = get_zeroed_page(GFP_KERNEL); + set_bit( __pa(va)>>PAGE_SHIFT, no_unmap_bitmap); + return (void*)va; +} + +static int get_more_cow_pages(int n) +{ + int i; + for(i=num_cow_pages; i=PAGE_SIZE/sizeof(void*) || !( cow_pages[i] = (unsigned long)get_checkpoint_page())) + { + printk("!!!!!!!!! 
not enough pages for checkpoint!\n"); + + while(i>=0) free_page(cow_pages[--i]); + return -ENOMEM; + } + } + num_cow_pages += n; + return 0; +} + +int start_migration(unsigned long dst_ip, unsigned short src_port, + unsigned short dst_port) +{ + int i; + int order; + unsigned long pfn; + int rc; + struct mm_struct* mm; + struct task_struct* p; + + trap_info_t trap_table[] = { + { 14, 0|4, __KERNEL_CS, (unsigned long)checkpoint_page_fault}, + { 0, 0, 0, 0 } + }; + + xxprintk("start migration!\n"); + + kfree(physmap_copy); + + num_cow_pages = 0; + + migration_dst_ip = dst_ip; + migration_src_port = src_port; + migration_dst_port = dst_port; + + + order=get_order(BITMAP_SIZE_BYTES); + + for(i=0; i>PAGE_SHIFT; + set_bit( pfn+j, no_unmap_bitmap); + } + } + + cow_pages = (unsigned long*)get_zeroed_page(GFP_KERNEL); + cow_pfns = get_checkpoint_page(); + + physmap_copy = kmalloc(max_low_pfn*sizeof(struct dict_entry),GFP_KERNEL); + + rc = get_more_cow_pages(64); + if(rc < 0) return rc; + + /* current process cannot survive with read-only ring0 stack */ + pfn = __pa(current_stack_pointer & 0xffffe000) >>PAGE_SHIFT; + set_bit(pfn, no_unmap_bitmap); + set_bit(pfn+1, no_unmap_bitmap); + + migration_page = get_checkpoint_page(); + + spin_lock_init(&migration_page->lock); + migration_page->migrator=current; + + ccheck_init(); + + /* + * First we are going to send a copy of the entire memory. + * We mark pages RO before sending them, so we will know if they are + * modified later. + * + */ + + migration_page->dirty_bitmap = dirty_bitmap0; + migration_page->send_bitmap = dirty_bitmap1; + + memset(migration_page->send_bitmap, 0, BITMAP_SIZE_BYTES); + + mm = current->active_mm; + down_read(&mm->mmap_sem); + spin_lock_irq(&mmlist_lock); + set_migrating(1); + HYPERVISOR_set_trap_table(trap_table); + + set_pte = do_checkpoint_set_pte; + + /* set up a table which will allow us to remap page tables when recovering + * the checkpoint. 
The table will be kept updated during checkpointing, and + * by sorting it on the mfn key we can use it as a dictionary later on. + */ + + for(i=0; imm) unmap_mm(p->mm); + } + +#if 0 + spin_lock(&mm->page_table_lock); + __unmap_mm(mm); + __unmap_kernel(); + spin_unlock(&mm->page_table_lock); +#endif + + /* initially, all pages are queued for transmission */ + memset(migration_page->send_bitmap, 0xff, BITMAP_SIZE_BYTES); + + spin_unlock_irq(&mmlist_lock); + up_read(&mm->mmap_sem); + + + return 0; + +} + +void noinline checkpoint_fork(void) +{ + /* here we manually copy the kernel stack pages of the migrator, so + * that we can correctly fork it. By popping registers and eip from the + * resume_stack, the forked 'child' will be able to continue from here + */ + + asm volatile ( + "pushal\n" \ + "pushf\n" \ + "pushl %ds\n" \ + "pushl %es\n" \ + "pushl %fs\n" \ + "pushl %gs\n" \ + + "movl %esp, %esi\n" \ + "andl $0xffffe000,%esi\n" \ + + + "movl cow_pages, %eax\n" \ + "movl (%eax), %edi\n" \ + "movl $0x400,%ecx\n" \ + "cld\n" \ + "rep\n" \ + "movsl\n" \ + + "movl 4(%eax), %edi\n" \ + "movl $0x400,%ecx\n" \ + "cld\n" \ + "rep\n" \ + "movsl\n" \ + + "movl %esp,resume_stack\n" \ + + "addl $4*4, %esp\n" \ + "popf\n" \ + "popal\n" \ + "ret\n" \ + ); + +} + +static noinline void checkpoint_backup_page(unsigned long pfn) +{ + void* addr; + unsigned long dst; + int idx; + pte_t* pte_k; + + /* protect against double entries */ + + if(test_and_set_bit(pfn, cow_bitmap)) return; + + + if(pfn>max_low_pfn || pfn_to_mfn(pfn) == INVALID_P2M_ENTRY + || pfn_to_mfn(pfn) == 0x55555555) + { + xxprintk("INVALID!\n"); + print_long(pfn); + return; + } + + addr = __va(pfn<cow_idx++; + if(idx==MAX_COW_PAGES) + { + printk(" +++++++++++ out of cow.\n"); + for(;;); + } + //BUG_ON(idx==MAX_COW_PAGES); + + cow_pfns[idx] = pfn; + dst = cow_pages[idx]; + + perhaps_flush_wrpt(); + + +#ifdef SELFMIG_CHECKPOINT_CHECK + { + int j; + unsigned long dst = fix_to_virt(FIX_CHECKPOINT_TMP); + unsigned long*b = 
(unsigned long*)dst; + unsigned long*a = (unsigned long*)addr; + + int diffs=0; + + pte_t pte; + pte.pte_low = (migration_page->mfn_list[2*pfn+1]<mm) + { + struct mm_struct* mm = p->mm; + unsigned long* pgd = (unsigned long*) mm->pgd; + int k; + + xxprintk("process: "); xxprintk(p->comm); + xxprintk("\n"); + + for(k=0; k>22; k++) + { + if(pgd[k]>>PAGE_SHIFT== pfn_to_mfn(pfn)) + { + xxprintk("\nin pgd at "); print_long(k); + } + } + + + } + } + } + +#if 0 + if(b[j] & 1) + { + int k; + unsigned long*pgd = __va(read_cr3()); + for(k=0; k>22; k++) + { + if(pgd[k]>>PAGE_SHIFT== b[j]>>PAGE_SHIFT) + { + xxprintk("in pgd at "); print_long(k); + } + } + } +#endif + + } + } + + if(diffs) + { + xxprintk("aargh bugged page at "); print_long((unsigned long)addr); + xxprintk("proc is "); xxprintk(current->comm); xxprintk("\n"); + xxprintk("diffs "); print_long(diffs); + xxprintk("cr3" ); print_long((unsigned long) __va(read_cr3())); + if(in_interrupt()) xxprintk(" in interr\n"); + pte_t* pte = lookup_address((unsigned long)addr); + xxprintk("mapping pte "); print_long((unsigned long) pte->pte_low); + xxprintk("nofaults "); print_long(migration_page->nofaults); + + struct page* page = pfn_to_page(pfn); + if(test_bit(PG_foreign, &(page)->flags)) + { + xxprintk(" is foreign page!\n"); + + } + + //HYPERVISOR_shutdown(0); + } + } +#endif + + memcpy( (void*) dst, addr, PAGE_SIZE); +} + +void set_dirty_checkpoint_pfn(unsigned long pfn) +{ + if(pfn>=max_low_pfn) + { + xxprintk("BAD Pfn\n"); + return; + //BUG(); + } + + if(is_migrating()==1) + { + if(test_and_set_bit(pfn, migration_page->dirty_bitmap)) + { + migration_page->credit--; + } + else migration_page->wss++; + + if(test_and_clear_bit(pfn, migration_page->send_bitmap)) + migration_page->credit++; + } + else if(is_migrating()==2) + { + if(test_bit(pfn, migration_page->send_bitmap)) + { + checkpoint_backup_page(pfn); + } + } + +} + +void set_dirty_checkpoint_mfn(unsigned long mfn) +{ + if(mfn_to_pfn(mfn)>=max_low_pfn) + { + 
xxprintk("bad pfn from mfn "); print_long(mfn); + return; + //BUG(); + } + else set_dirty_checkpoint_pfn(mfn_to_pfn(mfn)); +} + +void set_dirty_va_mapping(unsigned long address) +{ + pte_t *pte; + + if(address>=TASK_SIZE) + { + pte = lookup_address(address); + } + else + { + pgd_t* cur_pgd = __va(read_cr3()); //current->active_mm->pgd; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + int index = pgd_index(address); + + pgd = index + cur_pgd; + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + pte = pte_offset_map(pmd, address); + } + if(!pte) {xxprintk("NULL result for "); print_long(address); } + + set_dirty_checkpoint_pfn(__pa(pte)>>PAGE_SHIFT); + + if(addressdirty_bitmap); + unsigned long* y = (unsigned long*) (&migration_page->send_bitmap); + + *x ^= *y; + *y ^= *x; + *x ^= *y; +} + +int delta_checkpoint(void) +{ + int i; + int cow_mode=0; + int rc=0; + int flags; + + struct mm_struct* mm = current->active_mm; + int oldnofaults = migration_page->nofaults; + struct task_struct* p; + + xxprintk("delta ----\n"); + + if( (num_cow_pages - migration_page->wss) < 4) + { + get_more_cow_pages(64); + } + + + /* after taking the lock below, we will not allow writes to other areas of + * mem than the current stack and our specially allocated pages in here, as + * otherwise the checkpoint will be inconsistent + */ + + down_read(&mm->mmap_sem); + spin_lock_irq(&migration_page->lock); + + if( (num_cow_pages - migration_page->wss) >= 4) + cow_mode=1; + + /* wss fits in cow-pages, prepare for fork */ + if(cow_mode) + { + unsigned long esp_top, pfn; + + /* here is some stuff we need to know when resuming */ + old_console_pfn = mfn_to_pfn(xen_start_info->console_mfn); + old_store_pfn = mfn_to_pfn(xen_start_info->store_mfn); + cur_pgd_phys = read_cr3(); + + //xenbus_suspend(); + + /* we take special care of the current stack pages, so that even + * the migrator process will survive migration */ + + esp_top = current_stack_pointer & 0xffffe000; + pfn = __pa(esp_top) 
>>PAGE_SHIFT; + + for(i=0; i<2; i++) + { + set_bit(pfn+i, cow_bitmap); + set_bit(pfn+i, migration_page->dirty_bitmap); + cow_pfns[i] = pfn+i; + } + + migration_page->cow_idx=i; // assume 8kB stack + + checkpoint_fork(); + + + if(!is_migrating()) + { + xxprintk("__sti!\n"); + + __sti(); + + xenbus_reset(); + xencons_resume(); + xenbus_resume(); + +#ifdef CONFIG_XEN_GFX_DRIVER + extern void gfx_resume(void); + gfx_resume(); +#endif + + printk("unforked checkpoint.\n"); + + rc=1; + goto out; + } + } + + + if(cow_mode) + { + migration_page->nofaults++; + /* all write faults from now on will result in copy-on-write */ + set_migrating(2); + } + + swap_buffers(); + + memset(unmapped_pgds_bitmap,0,BITMAP_SIZE_BYTES); + migration_page->wss = 0; + + spin_lock_irqsave(&mm->page_table_lock,flags); + + for_each_process(p) + { + struct mm_struct* mm = p->mm; + if(mm) __unmap_mm(mm); + } + __unmap_kernel(); + +#if 0 + __unmap_mm(mm); + __unmap_kernel(); // TODO take pagetable lock first +#endif + + spin_unlock_irqrestore(&mm->page_table_lock,flags); + + if(cow_mode) + { + ccheck_backup_system_pages(); + +#if 1 + for(i=0; iflags) && + test_bit(i,migration_page->send_bitmap)) + { + xxprintk("foreign: "); print_long(i); + checkpoint_backup_page(i); + } + } +#endif + + migration_page->cpu_state.eip = (unsigned long) &recover0; + migration_page->cpu_state.esp = resume_stack; + migration_page->cpu_state.first_phys_map_pfn = __pa(phys_to_machine_mapping)>>PAGE_SHIFT; + + } + + migration_page->nofaults = oldnofaults; + + +out: + spin_unlock_irq(&migration_page->lock); + up_read(&mm->mmap_sem); + + return rc; +} + +static inline ssize_t copy_page_to_user(char* buf, unsigned long src_pfn, unsigned long pfn) +{ + ssize_t copied = 0; + int rc; + + pte_t pte; + pte_t no_pte = {0}; + unsigned long addr = fix_to_virt(FIX_CHECKPOINT_TMP2); + + if(!put_user(pfn, (unsigned long*)buf)) copied+=sizeof(pfn); + + pte.pte_low = ( pfn_to_mfn(src_pfn)<cpu_state, sizeof(migration_page->cpu_state))) + 
return -EFAULT; + + copied+=sizeof(migration_page->cpu_state); + } + + else /* first+second phases, full copy + resend-on-write */ + { + while(migration_page->last_read_pfnlast_read_pfn++; + + if(test_and_clear_bit(i, migration_page->send_bitmap)) //EXP + { + unsigned long pfn = i; + int c; + + /* copy pages to checkpoint, checking for pristine + * backup copies in the cow pages, and redoing if a + * cow-backup is made during copy. */ + + c = test_bit(i, cow_bitmap); + do + { + if(c) + { + int j; + + for(j=0; jcow_idx && + cow_pfns[j]!=i; j++); + + if(jcow_idx) + { + pfn = __pa(cow_pages[j])>>PAGE_SHIFT; + } + else + { + xxprintk("no cow page for "); + BUG(); + } + + } + copied = copy_page_to_user(buf, pfn, i); + } + while(c ^= test_bit(i, cow_bitmap)); + + if(copied) migration_page->copied_pages++; + break; + } + } + + if(migration_page->last_read_pfn==max_low_pfn) + { + print_long(migration_page->wss); + if(is_migrating()==2) + { + set_migrating(0); + free_checkpoint_pages(); + } + else + { + if( delta_checkpoint() ) + { + /* if we get here, migration was succesfull and we are + * now on the 'other side'. 
Signal this to the user + * process */ + + copied = -EIO; + } + else + { + migration_page->last_read_pfn=0; + migration_page->prev_copied_pages = migration_page->copied_pages; + migration_page->copied_pages=0; + + migration_page->num_deltas++; + } + } + + } + + } + + } while(copied==0); + +#if 1 + if(is_migrating()==0 || migration_page->credit++ > 100) + { + wake_up(&waiting_for_checkpoint); + } +#endif + + return copied; +} + +static void __unmap_kernel(void) +{ + unsigned long address; + + wmb(); + + for(address = PAGE_OFFSET; address> PAGE_SHIFT; + if(test_bit(pfn, no_unmap_bitmap)) continue; + } + if(address==(unsigned long)HYPERVISOR_shared_info) continue; + + pgd = pgd_offset_k(address); + if(pgd_none(*pgd)) continue; + + pud = pud_offset(pgd,address); + if(pud_none(*pud)) continue; + + pmd = pmd_offset(pud,address); + if(pmd_none(*pmd)) continue; + + pte_k = pte_offset_kernel(pmd,address); + + if (pte_k && pte_present(*pte_k)) + { + if(pte_write(*pte_k)) + { + *pte_k = pte_mkchkpoint(*pte_k); + } + } + } + wmb(); + flush_tlb_all(); +} + +static void unmap_kernel(void) +{ + struct mm_struct* mm = current->active_mm; + down_read(&mm->mmap_sem); + spin_lock_irq(&mm->page_table_lock); + + __unmap_kernel(); + + spin_unlock_irq(&mm->page_table_lock); + up_read(&mm->mmap_sem); +} + + + +static void __unmap_mm(struct mm_struct* mm) +{ + unsigned long address; + //migration_page->nofaults++; + + wmb(); + + for(address = 0L; addressnofaults--; +} + +void unmap_mm(struct mm_struct* mm) +{ + down_read(&mm->mmap_sem); + spin_lock_irq(&mm->page_table_lock); + + __unmap_mm(mm); + + spin_unlock_irq(&mm->page_table_lock); + up_read(&mm->mmap_sem); +} + + + +#if 0 +void checkpoint_unmap_mm(struct mm_struct* mm) +{ + //xxprintk("unmap mm "); print_long(mm->pgd); + if((is_migrating()==1 || is_migrating()==2) && + !test_and_set_bit(__pa(mm->pgd)>>PAGE_SHIFT,unmapped_pgds_bitmap)) + { + int flags; + + // down_read(&mm->mmap_sem); + // 
spin_lock_irqsave(&mm->page_table_lock,flags); + __unmap_mm(mm); + // spin_unlock_irqrestore(&mm->page_table_lock,flags); +// up_read(&mm->mmap_sem); + } +} + +EXPORT_SYMBOL(checkpoint_unmap_mm); +#endif + + + +static int checkpoint_open(struct inode * inode, struct file * filp) +{ + printk("checkpoint_open\n"); + //start_migration(0,0,0); + return 0; +} + +static int checkpoint_ioctl(struct inode *pino, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + extern unsigned long max_low_pfn; + + + if(cmd==1) + { + struct a { unsigned long ip; unsigned short src_port, dst_port; }; + struct a as; + + if(copy_from_user(&as, (void*)arg, sizeof(as))) + return -EFAULT; + + if( start_migration(as.ip,as.src_port,as.dst_port)==0) + return max_low_pfn; + else return -ENOMEM; + } + else return -EINVAL; +} + +/* TODO sendfile() interface */ + +static ssize_t read_checkpoint(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + extern unsigned long max_low_pfn; + ssize_t total_read=0; + + if (!count) + return 0; + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + if(*ppos==0) + { + if( start_migration(0,0,0)==0) + { + put_user(max_low_pfn, (unsigned long*)buf); + total_read += sizeof(unsigned long); + } + else return -ENOMEM; + } + + while(count-total_read >= PAGE_SIZE+4) + { + ssize_t got = read_next_page(buf+total_read); + + if(got>=0) total_read += got; + else return got; + + if(got!= PAGE_SIZE+4) break; + } + + *ppos += total_read; + + //printk("read %d\n",total_read); + return total_read; +} + +static struct file_operations checkpoint_fops = { + .read = read_checkpoint, + .ioctl = checkpoint_ioctl, + .open = checkpoint_open, +}; + +#define CHECKPOINT_MINOR 203 + +static struct miscdevice checkpoint_miscdev = { + .minor = CHECKPOINT_MINOR, + .name = "checkpoint", + .fops = &checkpoint_fops, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + .devfs_name = "misc/checkpoint", +#endif +}; + +static int __init checkpoint_init(void) +{ + int 
err; + + /* (DEVFS) create '/dev/misc/checkpoint'. */ + err = misc_register(&checkpoint_miscdev); + if (err != 0) + { + printk(KERN_ALERT "Could not register /dev/misc/checkpoint\n"); + return err; + } + + printk("Checkpoint device installed.\n"); + + return 0; +} + +static void checkpoint_cleanup(void) +{ + misc_deregister(&checkpoint_miscdev); +} + +module_init(checkpoint_init); +module_exit(checkpoint_cleanup); diff -r 1e225598ce82 linux-2.6-xen-sparse/include/mach-xen/asm/migrate.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/include/mach-xen/asm/migrate.h Tue Oct 10 07:00:16 2006 -0700 @@ -0,0 +1,79 @@ + +#define MAX_DIRTY_MMS 128 +#define MAX_FAULTEES 128 +struct dirty_mm +{ + struct mm_struct* mm; + unsigned short num_faults; +}; + + +struct mig_page { + int migrating; // needs to be first entry so struct can be cast to int* and this checked + void* dirty_bitmap; // needs to be second, for similar reason + void* send_bitmap; + + int nofaults; + + spinlock_t lock; + struct task_struct* migrator; + + unsigned int num_deltas,last_read_pfn,copied_pages,prev_copied_pages; + struct mm_struct* dirty_mms[MAX_DIRTY_MMS]; + + unsigned int num_dirty_pages; + unsigned int num_dirty_mms; + + unsigned long cur_pgd; + + int wrpt_need_flush; + + int cow_idx; + int credit; + int wss; + unsigned long* mfn_list; + + struct + { + unsigned long eip; + unsigned long esp; + unsigned long first_phys_map_pfn; + } cpu_state; +}; + +extern struct mig_page* migration_page; +extern struct mig_page_queue migration_queue; + +static inline int is_migrating(void) +{ + if(!migration_page) return 0; + else return migration_page->migrating; +} +static inline void set_migrating(int m) +{ + migration_page->migrating=m; + wmb(); +} + +static inline pte_t pte_mkchkpoint(pte_t pte) +{ + pte.pte_low = ((pte.pte_low & _PAGE_RW)<<9) | + (pte.pte_low & ~_PAGE_RW); + return pte; +} + +static inline void print_long(unsigned long a) +{ + char string[10]; + char hex[] = 
"0123456789abcdef"; + int i; + for(i=0; i<8; i++) + { + string[7-i] = hex[a & 0xf]; + a>>=4; + } + string[8] = '\n'; + string[9] = '\0'; + + HYPERVISOR_console_io(CONSOLEIO_write, 10, string); +}