Linux启动代码分析

来源:百度文库 编辑:神马文学网 时间:2024/04/29 13:37:52
Kernel: 2.6.10-rc2
Finished: 01/01/05


/*
*    Activate the first processor.
*/

asmlinkage void __init start_kernel(void)
{
   /*
    * Boot-time initialisation sequence: runs the setup calls below in a
    * fixed order (interrupts stay disabled until local_irq_enable()) and
    * finally hands over to rest_init().  Takes no arguments.
    */
   char * command_line;
   extern struct kernel_param __start___param[], __stop___param[];
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
   lock_kernel();        /* take the kernel lock */
   page_address_init();    /* only does real work when highmem is configured */
   printk(linux_banner);    /* print the kernel version banner */
   setup_arch(&command_line); /* architecture-specific setup, including page mappings, ACPI, etc. */
   setup_per_cpu_areas();    /* set up the per-CPU area offsets used on SMP */

   /*
    * Mark the boot cpu "online" so that it can call console drivers in
    * printk() and can access its per-cpu storage. (Marks the boot CPU as
    * being in the running state.)
    */
   smp_prepare_boot_cpu();

   /*
    * Set up the scheduler prior starting any interrupts (such as the
    * timer interrupt). Full topology setup happens at smp_init()
    * time - but meanwhile we still have a functioning scheduler.
    */
   sched_init();    /* runqueue setup */
   build_all_zonelists();    /* build the allocation policy */
   page_alloc_init();    /* CPU-hotplug setup */
   printk("Kernel command line: %s\n", saved_command_line);
   parse_early_param();
   parse_args("Booting kernel", command_line, __start___param,
          __stop___param - __start___param,
          &unknown_bootoption);    /* parse the parameters passed to the kernel and apply them */
   sort_main_extable();    /* sort the exception-handler table */
   trap_init();        /* re-initialise the interrupt vector table */
   rcu_init();        /* initialise RCU (Read-Copy Update), mainly a per_cpu_rcu_tasklet */
   init_IRQ();        /* initialise the interrupt service queues; no concrete handlers yet, those are registered through request_irq() */
   pidhash_init();        /* initialise the pidhash tables (5 of them; presumably one per pid type - TODO confirm) */
   init_timers();        /* initialise the per_cpu_tvec_bases queues and set up TIMER_SOFTIRQ */
   softirq_init();        /* initialise softirqs and tasklets */
   time_init();        /* initialise the hardware clock and its interrupt */

   /*
    * HACK ALERT! This is early. We're enabling the console before
    * we've done PCI setups etc, and console_init() must be aware of
    * this. But we do want output early, in case something goes wrong.
    */
   console_init();
   /* a panic requested earlier is only raised now that the console is up */
   if (panic_later)
       panic(panic_later, panic_param);
   profile_init();        /* profiling setup */
   local_irq_enable();    /* enable interrupts */
#ifdef CONFIG_BLK_DEV_INITRD
   /* drop the initrd if it starts below the first usable low page */
   if (initrd_start && !initrd_below_start_ok &&
           initrd_start < min_low_pfn << PAGE_SHIFT) {
       printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
          "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
       initrd_start = 0;
   }
#endif
   vfs_caches_init_early();    /* initialise the hash tables of the dentry and inode caches */
   mem_init();            /* final memory init; puts low memory onto the free lists */
   kmem_cache_init();        /* slab allocator init */
   numa_policy_init();        /* TODO: purpose not documented in this walkthrough */
   if (late_time_init)
       late_time_init();
   calibrate_delay();        /* compute BogoMIPS */
   pidmap_init();            /* initialise the pid bitmap */
   pgtable_cache_init();        /* set up the pgd/pmd slabs */
   prio_tree_init();        /* initialise index_bits_to_maxindex, for (struct page)->mapping->i_map */
   anon_vma_init();        /* set up the anon_vma slab, used for rmap support */
#ifdef CONFIG_X86
   if (efi_enabled)
       efi_enter_virtual_mode();
#endif
   fork_init(num_physpages);    /* compute the maximum safe number of tasks and set the current task's process limit */
   proc_caches_init();        /* set up the remaining slabs */
   buffer_init();            /* buffer head init */
   unnamed_dev_init();        /* TODO: unclear - what is idr? */
   security_init();        /* security framework init */
   vfs_caches_init(num_physpages);    /* set up the caches needed by the VFS */
   radix_tree_init();        /* radix tree init; mainly speeds up lookup of dirty or writeback pages */
   signals_init();            /* create the sigqueue slab */
   /* rootfs populating might need page-writeback */
   page_writeback_init();        /* compute the system's vm ratios etc. and decide whether writeback is needed */
#ifdef CONFIG_PROC_FS
   proc_root_init();        /* initialise procfs and create directories/files according to the configuration */
#endif
   check_bugs();

   acpi_early_init(); /* before LAPIC and SMP init */

   /* Do the rest non-__init'ed, we're now alive */
   rest_init();            /* create the init process */
}

/* arch/i386/kernel/setup.c */
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
* global efi_enabled. This allows the same kernel image to be used on existing
* systems (with a traditional BIOS) as well as on EFI systems.
* 检测是否是通过EFI引导kernel.如果是,将通过efi导入memmap, systab等,因此用此数据
* 结构进行初始化。
* Note: efi初始化路径由全局变量efi_enabled决定(是否配置efi_enable?)。
*/
void __init setup_arch(char **cmdline_p)
{
   /*
    * i386 architecture setup.  Copies the boot-time hardware information
    * into kernel variables, obtains the memory map, parses the early
    * command line, initialises bootmem and paging, and probes ACPI/SMP
    * configuration.
    *
    * cmdline_p: out parameter; parse_cmdline_early() stores the command
    *            line pointer through it.
    */
   unsigned long max_low_pfn;

   memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
   pre_setup_arch_hook();    /* run architecture-specific hooks; empty on i386 */
   early_cpu_init();    /* record the detected CPU information */

   /*
    * FIXME: This isn't an official loader_type right
    * now but does currently work with elilo.
    * If we were configured as an EFI kernel, check to make
    * sure that we were loaded correctly from elilo and that
    * the system table is valid. If not, then initialize normally.
    */
#ifdef CONFIG_EFI
   if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
       efi_enabled = 1;
#endif
   /*
    * Copy the information that setup gathered after the BIOS self-test
    * into kernel memory (it was originally kept in a temporary page).
    */
   ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
   drive_info = DRIVE_INFO;
   screen_info = SCREEN_INFO;
   edid_info = EDID_INFO;
   apm_info.bios = APM_BIOS_INFO;
   ist_info = IST_INFO;
   saved_videomode = VIDEO_MODE;
   if( SYS_DESC_TABLE.length != 0 ) {
       MCA_bus = SYS_DESC_TABLE.table[3] &0x2;
       machine_id = SYS_DESC_TABLE.table[0];
       machine_submodel_id = SYS_DESC_TABLE.table[1];
       BIOS_revision = SYS_DESC_TABLE.table[2];
   }
   aux_device_present = AUX_DEVICE_INFO;

#ifdef CONFIG_BLK_DEV_RAM
   rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
   rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
   rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
   ARCH_SETUP    /* does nothing on the x86 family */
   if (efi_enabled)
       efi_init();
   else {
       printk(KERN_INFO "BIOS-provided physical RAM map:\n");
       print_memory_map(machine_specific_memory_setup()); /* process the memory map; the result ends up in e820 */
   }

   copy_edd();    /* copy the enhanced disk drive parameters (from setup's probe data); experimental, CONFIG_EDD */

   if (!MOUNT_ROOT_RDONLY)
       root_mountflags &= ~MS_RDONLY;
   /* describe the kernel image in init_mm and in the resource records */
   init_mm.start_code = (unsigned long) _text;
   init_mm.end_code = (unsigned long) _etext;
   init_mm.end_data = (unsigned long) _edata;
   init_mm.brk = init_pg_tables_end + PAGE_OFFSET;

   code_resource.start = virt_to_phys(_text);
   code_resource.end = virt_to_phys(_etext)-1;
   data_resource.start = virt_to_phys(_etext);
   data_resource.end = virt_to_phys(_edata)-1;

   parse_cmdline_early(cmdline_p);    /* parse user-supplied boot parameters (e.g. mem=xxx, acpi=xx, and so on) */

   max_low_pfn = setup_memory();    /* groundwork for page mapping (builds the map) */

   /*
    * NOTE: before this point _nobody_ is allowed to allocate
    * any memory using the bootmem allocator. Although the
    * allocator is now initialised only the first 8Mb of the kernel
    * virtual address space has been mapped. All allocations before
    * paging_init() has completed must use the alloc_bootmem_low_pages()
    * variant (which allocates DMA'able memory) and care must be taken
    * not to exceed the 8Mb limit.
    */

#ifdef CONFIG_SMP
   smp_alloc_memory(); /* AP processor realmode stacks in low memory (memory for starting the other SMP CPUs) */
#endif
   paging_init();    /* initialise paging */

   /*
    * NOTE: at this point the bootmem allocator is fully available.
    */

#ifdef CONFIG_EARLY_PRINTK
   {
       char *s = strstr(*cmdline_p, "earlyprintk=");
       if (s) {
           extern void setup_early_printk(char *);

           setup_early_printk(s);
           printk("early console enabled\n");
       }
   }
#endif


   dmi_scan_machine(); /* DMI=Desktop Management Interface */

#ifdef CONFIG_X86_GENERICARCH
   generic_apic_probe(*cmdline_p);    /* probe the APIC (Advanced Programmable Interrupt Controller) */
#endif
   if (efi_enabled)
       efi_map_memmap();

   /*
    * Parse the ACPI tables for possible boot-time SMP configuration.
    */
   acpi_boot_init();

#ifdef CONFIG_X86_LOCAL_APIC
   if (smp_found_config)
       get_smp_config();
#endif

   register_memory(max_low_pfn);    /* build the resource tree for the system's I/O resources */

#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
   /* use the VGA console unless EFI reports 0xa0000 as conventional memory */
   if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
       conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
   conswitchp = &dummy_con;
#endif
#endif
}

/* arch/i386/kernel/cpu/common.c */
/*
 * early_cpu_init - run the per-vendor CPU init hooks, then the early CPU
 * detection.  With CONFIG_DEBUG_PAGEALLOC it additionally clears the PSE
 * feature bit and sets disable_pse.
 */
void __init early_cpu_init(void)
{
   /* 9 kinds of x86-family CPUs are currently supported; each is assigned into cpu_devs */
   intel_cpu_init();    /* fill in the Intel CPU structure */
   cyrix_init_cpu();
   nsc_init_cpu();
   amd_init_cpu();
   centaur_init_cpu();
   transmeta_init_cpu();
   rise_init_cpu();
   nexgen_init_cpu();
   umc_init_cpu();
   early_cpu_detect();    /* detect the CPU and store the result in boot_cpu_data */

#ifdef CONFIG_DEBUG_PAGEALLOC
   /* pse is not compatible with on-the-fly unmapping,
    * disable it even if the cpus claim to support it.
    */
   clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
   disable_pse = 1;
#endif
}

/* arch/i386/kernel/setup.c */
/*
 * print_memory_map - print every entry of the global e820 memory map.
 *
 * who: label printed at the start of each line.
 *
 * One line is printed per entry: the start address, the end address
 * (start + size) and a textual form of the entry type.  Types other than
 * the four known ones are printed numerically.
 */
static void __init print_memory_map(char *who)
{
   int i;

   for (i = 0; i < e820.nr_map; i++) {
       /* e820.map is an array of entries: index it with i */
       printk(" %s: %016Lx - %016Lx ", who,
           e820.map[i].addr,
           e820.map[i].addr + e820.map[i].size);
       switch (e820.map[i].type) {
       case E820_RAM:    printk("(usable)\n");
               break;
       case E820_RESERVED:
               printk("(reserved)\n");
               break;
       case E820_ACPI:
               printk("(ACPI data)\n");
               break;
       case E820_NVS:
               printk("(ACPI NVS)\n");
               break;
       default:    printk("type %lu\n", e820.map[i].type);
               break;
       }
   }
}

/* arch/i386/kernel/setup.c */
/*
 * parse_cmdline_early - scan the boot command line for early options.
 *
 * Walks saved_command_line one character at a time, copying it into
 * command_line while acting on the options recognised below.  "mem=" and
 * "memmap=" are consumed (not copied); the other options are acted on and
 * still copied.
 *
 * cmdline_p: out parameter; set to command_line on return.
 *
 * If a memory option changed the map, the resulting map is printed with the
 * label "user".
 */
static void __init parse_cmdline_early (char ** cmdline_p)
{
   /* c holds the previously read character; starting with ' ' lets the
    * very first option match the "preceded by a space" tests below */
   char c = ' ', *to = command_line, *from = saved_command_line;
   int len = 0;
   int userdef = 0;

   /* Save unparsed command line copy for /proc/cmdline */
   saved_command_line[COMMAND_LINE_SIZE-1] = '\0';

   for (;;) {
       /*
        * "mem=nopentium" disables the 4MB page tables.
        * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
        * to <mem>, overriding the bios size.
        * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
        * <start> to <start>+<mem>, overriding the bios size.
        *
        * HPA tells me bootloaders need to parse mem=, so no new
        * option should be mem= [also see Documentation/i386/boot.txt]
        */
       if (c == ' ' && !memcmp(from, "mem=", 4)) {
           /* drop the separating space that was already copied */
           if (to != command_line)
               to--;
           if (!memcmp(from+4, "nopentium", 9)) {
               from += 9+4;
               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
               disable_pse = 1;
           } else {
               /* If the user specifies memory size, we
                * limit the BIOS-provided memory map to
                * that size. exactmap can be used to specify
                * the exact map. mem=number can be used to
                * trim the existing memory map.
                */
               unsigned long long mem_size;

               /* memparse() also advances from past the parsed value */
               mem_size = memparse(from+4, &from);
               limit_regions(mem_size);
               userdef=1;
           }
       }

       if (c == ' ' && !memcmp(from, "memmap=", 7)) {
           /* drop the separating space that was already copied */
           if (to != command_line)
               to--;
           if (!memcmp(from+7, "exactmap", 8)) {
               from += 8+7;
               /* discard the current map entries */
               e820.nr_map = 0;
               userdef = 1;
           } else {
               /* If the user specifies memory size, we
                * limit the BIOS-provided memory map to
                * that size. exactmap can be used to specify
                * the exact map. mem=number can be used to
                * trim the existing memory map.
                */
               unsigned long long start_at, mem_size;

               mem_size = memparse(from+7, &from);
               /* the character after the size selects the region type:
                * '@' RAM, '#' ACPI data, '$' reserved */
               if (*from == '@') {
                   start_at = memparse(from+1, &from);
                   add_memory_region(start_at, mem_size, E820_RAM);
               } else if (*from == '#') {
                   start_at = memparse(from+1, &from);
                   add_memory_region(start_at, mem_size, E820_ACPI);
               } else if (*from == '$') {
                   start_at = memparse(from+1, &from);
                   add_memory_region(start_at, mem_size, E820_RESERVED);
               } else {
                   limit_regions(mem_size);
                   userdef=1;
               }
           }
       }

       /*
        * NOTE(review): the else-if chain below hangs off the "memmap="
        * test above, and its branches do not check c == ' ', so they are
        * tried at every character position that is not a "memmap=" match.
        */
#ifdef CONFIG_X86_SMP
       /*
        * If the BIOS enumerates physical processors before logical,
        * maxcpus=N at enumeration-time can be used to disable HT.
        */
       else if (!memcmp(from, "maxcpus=", 8)) {
           extern unsigned int maxcpus;

           maxcpus = simple_strtoul(from + 8, NULL, 0);
       }
#endif

#ifdef CONFIG_ACPI_BOOT
       /* "acpi=off" disables both ACPI table parsing and interpreter */
       else if (!memcmp(from, "acpi=off", 8)) {
           disable_acpi();
       }

       /* acpi=force to over-ride black-list */
       else if (!memcmp(from, "acpi=force", 10)) {
           acpi_force = 1;
           acpi_ht = 1;
           acpi_disabled = 0;
       }

       /* acpi=strict disables out-of-spec workarounds */
       else if (!memcmp(from, "acpi=strict", 11)) {
           acpi_strict = 1;
       }

       /* Limit ACPI just to boot-time to enable HT */
       else if (!memcmp(from, "acpi=ht", 7)) {
           if (!acpi_force)
               disable_acpi();
           acpi_ht = 1;
       }

       /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
       else if (!memcmp(from, "pci=noacpi", 10)) {
           acpi_disable_pci();
       }
       /* "acpi=noirq" disables ACPI interrupt routing */
       else if (!memcmp(from, "acpi=noirq", 10)) {
           acpi_noirq_set();
       }

       else if (!memcmp(from, "acpi_sci=edge", 13))
           acpi_sci_flags.trigger = 1;

       else if (!memcmp(from, "acpi_sci=level", 14))
           acpi_sci_flags.trigger = 3;

       else if (!memcmp(from, "acpi_sci=high", 13))
           acpi_sci_flags.polarity = 1;

       else if (!memcmp(from, "acpi_sci=low", 12))
           acpi_sci_flags.polarity = 3;

#ifdef CONFIG_X86_IO_APIC
       else if (!memcmp(from, "acpi_skip_timer_override", 24))
           acpi_skip_timer_override = 1;
#endif

#ifdef CONFIG_X86_LOCAL_APIC
       /* disable IO-APIC */
       else if (!memcmp(from, "noapic", 6))
           disable_ioapic_setup();
#endif /* CONFIG_X86_LOCAL_APIC */
#endif /* CONFIG_ACPI_BOOT */

       /*
        * highmem=size forces highmem to be exactly 'size' bytes.
        * This works even on boxes that have no highmem otherwise.
        * This also works to reduce highmem size on bigger boxes.
        */
       if (c == ' ' && !memcmp(from, "highmem=", 8))
           highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;

       /*
        * vmalloc=size forces the vmalloc area to be exactly 'size'
        * bytes. This can be used to increase (or decrease) the
        * vmalloc area - the default is 128m.
        */
       if (c == ' ' && !memcmp(from, "vmalloc=", 8))
           __VMALLOC_RESERVE = memparse(from+8, &from);

       /* copy the next character; stop at the terminator or when the
        * copy would reach COMMAND_LINE_SIZE */
       c = *(from++);
       if (!c)
           break;
       if (COMMAND_LINE_SIZE <= ++len)
           break;
       *(to++) = c;
   }
   *to = '\0';
   *cmdline_p = command_line;
   if (userdef) {
       printk(KERN_INFO "user-defined physical RAM map:\n");
       print_memory_map("user");
   }
}
/*
 * setup_memory - initialise the boot-time (bootmem) allocator for low memory
 * and reserve the regions that must not be handed out.
 *
 * Returns max_low_pfn as computed by find_max_low_pfn().
 */
static unsigned long __init setup_memory(void)
{
   unsigned long bootmap_size, start_pfn, max_low_pfn;

   /*
    * partially used pages are not usable - thus
    * we are rounding upwards:
    */
   start_pfn = PFN_UP(init_pg_tables_end);

   find_max_pfn();

   max_low_pfn = find_max_low_pfn();

#ifdef CONFIG_HIGHMEM
   /* highmem is empty unless max_pfn lies above max_low_pfn */
   highstart_pfn = highend_pfn = max_pfn;
   if (max_pfn > max_low_pfn) {
       highstart_pfn = max_low_pfn;
   }
   printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
       pages_to_mb(highend_pfn - highstart_pfn));
#endif
   printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
           pages_to_mb(max_low_pfn));
   /*
    * Initialize the boot-time allocator (with low memory only):
    */
   bootmap_size = init_bootmem(start_pfn, max_low_pfn);    /* marks the pages of this range as reserved; the result seems to live in node_data[0]->bdata (TODO confirm) */

   register_bootmem_low_pages(max_low_pfn);    /* set up the bitmap of all usable memory pages */

   /*
    * Reserve the bootmem bitmap itself as well. We do this in two
    * steps (first step was init_bootmem()) because this catches
    * the (very unlikely) case of us accidentally initializing the
    * bootmem allocator with an invalid RAM area.
    */
   reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
            bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));    /* reserve the kernel image in memory */

   /*
    * reserve physical page 0 - it's a special BIOS page on many boxes,
    * enabling clean reboots, SMP operation, laptop functions.
    */
   reserve_bootmem(0, PAGE_SIZE);    /* reserve physical page 0: mostly boot-related and BIOS information */

   /* reserve EBDA region, it's a 4K region */
   reserve_ebda_region();

  /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
    PCI prefetch into it (errata #56). Usually the page is reserved anyways,
    unless you have no PS/2 mouse plugged in. */
   if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
      boot_cpu_data.x86 == 6)
      reserve_bootmem(0xa0000 - 4096, 4096);

#ifdef CONFIG_SMP
   /*
    * But first pinch a few for the stack/trampoline stuff
    * FIXME: Don't need the extra page at 4K, but need to fix
    * trampoline before removing it. (see the GDT stuff)
    */
   reserve_bootmem(PAGE_SIZE, PAGE_SIZE);    /* needed on SMP systems */
#endif
#ifdef CONFIG_ACPI_SLEEP
   /*
    * Reserve low memory region for sleep support.
    */
   acpi_reserve_bootmem();
#endif
#ifdef CONFIG_X86_FIND_SMP_CONFIG
   /*
    * Find and reserve possible boot-time SMP configuration:
    */
   find_smp_config();
#endif

#ifdef CONFIG_BLK_DEV_INITRD
   /* keep the initrd only if it fits entirely below the end of low memory */
   if (LOADER_TYPE && INITRD_START) {
       if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
           reserve_bootmem(INITRD_START, INITRD_SIZE);
           initrd_start =
               INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
           initrd_end = initrd_start+INITRD_SIZE;
       }
       else {
           printk(KERN_ERR "initrd extends beyond end of memory "
              "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
              INITRD_START + INITRD_SIZE,
              max_low_pfn << PAGE_SHIFT);
           initrd_start = 0;
       }
   }
#endif
   return max_low_pfn;
}


/* arch/i386/mm/init.c */
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
* 已经有8MB内存在head.S中映射完成
* This routine also unmaps the page at virtual kernel address 0, so
* that we can trap those pesky NULL-reference errors in the kernel.
*/
void __init paging_init(void)
{
   /*
    * Builds the kernel page tables via pagetable_init(), loads
    * swapper_pg_dir into CR3, flushes the TLB, then sets up kmap and the
    * zone sizes.  Takes no arguments and returns nothing.
    */
#ifdef CONFIG_X86_PAE
   set_nx();
   if (nx_enabled)
       printk("NX (Execute Disable) protection: active\n");
#endif

   pagetable_init();    /* rewrite the kernel-space page tables; they were set up during early setup but left empty */

   load_cr3(swapper_pg_dir);

#ifdef CONFIG_X86_PAE
   /*
    * We will bail out later - printk doesn't work right now so
    * the user would just see a hanging kernel.
    */
   if (cpu_has_pae)
       set_in_cr4(X86_CR4_PAE);
#endif
   __flush_tlb_all();    /* flush the TLB */

   kmap_init();    /* set up the memory used for highmem mappings */
   zone_sizes_init(); /* memory init: pgdat_list->zone */
}

/*
 * pagetable_init - populate swapper_pg_dir.
 *
 * Enables the PSE and PGE CR4 bits when the CPU has them, maps low memory
 * through kernel_physical_mapping_init(), and creates the page-table
 * structure for the fixmap and permanent kmap areas.
 */
static void __init pagetable_init (void)
{
   unsigned long vaddr;
   pgd_t *pgd_base = swapper_pg_dir;

#ifdef CONFIG_X86_PAE    /* three-level page tables (Physical Address Extension) */
   int i;
   /* Init entries of the first-level page table to the zero page */
   for (i = 0; i < PTRS_PER_PGD; i++)    /* PTRS_PER_PGD=4 */
       set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
#endif

   /* Enable PSE if available (Page Size Extensions, 4MB pages) */
   if (cpu_has_pse) {
       set_in_cr4(X86_CR4_PSE);
   }

   /* Enable PGE if available (PTE Global Bit)*/
   if (cpu_has_pge) {
       set_in_cr4(X86_CR4_PGE);
       /* kernel mappings created from here on carry the global bit */
       __PAGE_KERNEL |= _PAGE_GLOBAL;
       __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
   }

   kernel_physical_mapping_init(pgd_base);    /* kernel-space mapping (0xC0000000.. => 0-max_low_pfn) */
   remap_numa_kva(); /* presumably re-initialises the NUMA kernel virtual address area - TODO confirm */

   /*
    * Fixed mappings, only the page table structure has to be
    * created - mappings will be set by set_fixmap(): (fixed-use addresses)
    */
   vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; /* in the high address region */
   page_table_range_init(vaddr, 0, pgd_base); /* set up page tables for the fixed addresses, including ACPI addresses etc. */

   permanent_kmaps_init(pgd_base); /* fixed address init (pkmap); TODO: purpose unclear - possibly used for highmem mappings */

#ifdef CONFIG_X86_PAE
   /*
    * Add low memory identity-mappings - SMP needs it when
    * starting up on an AP from real-mode. In the non-PAE
    * case we already have these mappings through head.S.
    * All user-space mappings are explicitly cleared after
    * SMP startup.
    */
   pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
#endif
}

/*
* This maps the physical memory to kernel virtual address space, a total
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET.(映射物理内存到系统空间虚拟地址,共max_low_pfn页面,从0xc0000000地址开始)
*/
static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
{
   /*
    * Maps page frames 0 .. max_low_pfn-1 at PAGE_OFFSET.
    *
    * pgd_base: top-level page directory to populate.
    *
    * With PSE each PMD entry maps PTRS_PER_PTE frames as one large page;
    * otherwise a page table is created per PMD entry and filled one PTE
    * per frame.  Executable protections are used for addresses for which
    * is_kernel_text() is true.
    */
   unsigned long pfn;
   pgd_t *pgd;
   pmd_t *pmd;
   pte_t *pte;
   int pgd_idx, pmd_idx, pte_ofs;

   pgd_idx = pgd_index(PAGE_OFFSET);    /* the mapping starts at the beginning of kernel space */
   pgd = pgd_base + pgd_idx;
   pfn = 0;

   for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
       pmd = one_md_table_init(pgd);    /* initialise the second-level directory */
       /* the PMD is still created for the remaining PGD slots, but nothing is mapped */
       if (pfn >= max_low_pfn)
           continue;
       for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
           unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;

           /* Map with big pages if possible, otherwise create normal page tables. */
           if (cpu_has_pse) {    /* large pages: no third-level page table is used */
               /* last byte covered by this large page */
               unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;

               if (is_kernel_text(address) || is_kernel_text(address2))
                   set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
               else
                   set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
               pfn += PTRS_PER_PTE;
           } else {
               pte = one_page_table_init(pmd);

               /*
                * address must advance together with pfn; otherwise every
                * page of this PMD range would take the protection decided
                * for its first page.
                */
               for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
                    pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
                       if (is_kernel_text(address))
                           set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
                       else
                           set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
               }
           }
       }
   }
}


/*
 * sched_init - boot-time scheduler setup.
 *
 * Puts each of the NR_CPUS runqueues into its initial empty state, takes a
 * reference on init_mm for lazy TLB use, and turns the calling task into the
 * idle task of the current CPU.
 */
void __init sched_init(void)
{
   runqueue_t *rq;
   int cpu, idx, prio;

   for (cpu = 0; cpu < NR_CPUS; cpu++) {
       prio_array_t *array;

       rq = cpu_rq(cpu);
       spin_lock_init(&rq->lock);
       rq->active = &rq->arrays[0];      /* active array */
       rq->expired = &rq->arrays[1];     /* expired array */
       rq->best_expired_prio = MAX_PRIO; /* lowest priority */

#ifdef CONFIG_SMP
       rq->sd = &sched_domain_dummy;
       rq->cpu_load = 0;                 /* CPU load */
       rq->active_balance = 0;
       rq->push_cpu = 0;
       rq->migration_thread = NULL;
       INIT_LIST_HEAD(&rq->migration_queue);
#endif
       atomic_set(&rq->nr_iowait, 0);

       /* empty both priority arrays: one list head and one bitmap bit per priority */
       for (idx = 0; idx < 2; idx++) {
           array = &rq->arrays[idx];
           for (prio = 0; prio < MAX_PRIO; prio++) {
               INIT_LIST_HEAD(&array->queue[prio]);
               __clear_bit(prio, array->bitmap);
           }
           /* delimiter for bitsearch */
           __set_bit(MAX_PRIO, array->bitmap);
       }
   }

   /* The boot idle thread does lazy MMU switching as well. */
   atomic_inc(&init_mm.mm_count);
   enter_lazy_tlb(&init_mm, current);

   /*
    * Make us the idle thread. Technically, schedule() should not be
    * called from this thread, however somewhere below it might be,
    * but because we are the idle thread, we just pick up running again
    * when this runqueue becomes "idle".
    */
   init_idle(current, smp_processor_id());
}

/*
 * trap_init - install the CPU exception gates and the system-call gate.
 *
 * Re-installs the vectors (per the original annotation they were pointed at
 * ignore_int during early initialisation), then calls cpu_init() and
 * trap_init_hook().  Takes no arguments and returns nothing.
 */
void __init trap_init(void)
{
#ifdef CONFIG_EISA
   /* compare against the characters 'E','I','S','A' packed into one 32-bit value */
   if (isa_readl(0x0FFFD9) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
       EISA_bus = 1;
   }
#endif

#ifdef CONFIG_X86_LOCAL_APIC
   init_apic_mappings();
#endif

   set_trap_gate(0,&divide_error);    /* trap gate */
   set_intr_gate(1,&debug);    /* interrupt gate */
   set_intr_gate(2,&nmi);
   set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
   set_system_gate(4,&overflow);
   set_system_gate(5,&bounds);
   set_trap_gate(6,&invalid_op);
   set_trap_gate(7,&device_not_available);
   set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
   set_trap_gate(9,&coprocessor_segment_overrun);
   set_trap_gate(10,&invalid_TSS);
   set_trap_gate(11,&segment_not_present);
   set_trap_gate(12,&stack_segment);
   set_trap_gate(13,&general_protection);
   set_intr_gate(14,&page_fault);
   set_trap_gate(15,&spurious_interrupt_bug);
   set_trap_gate(16,&coprocessor_error);
   set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
   set_trap_gate(18,&machine_check);
#endif
   set_trap_gate(19,&simd_coprocessor_error);

   set_system_gate(SYSCALL_VECTOR,&system_call);    /* system-call vector */

   /*
    * Should be a barrier for any external CPU state.
    */
   cpu_init();    /* reload the GDT and LDT */

   trap_init_hook(); /* do nothing on i386 */
}

/*
 * init_IRQ - install the external interrupt gates and start the timer.
 *
 * Points every external vector except SYSCALL_VECTOR at its own entry of the
 * interrupt[] stub table, runs the architecture hooks, programs the PIT, and
 * sets up the FPU interrupt when the FPU is external.
 */
void __init init_IRQ(void)
{
   int i;

   /* all the set up before the call gates are initialised */
   pre_intr_init_hook();    /* initialise the interrupt request queues */

   /*
    * Cover the whole vector space, no vector can escape
    * us. (some of these will be overridden and become
    * 'special' SMP interrupts)
    */
   for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
       int vector = FIRST_EXTERNAL_VECTOR + i;
       if (i >= NR_IRQS)
           break;
       /* each vector gets its own stub: interrupt[i], not the table base */
       if (vector != SYSCALL_VECTOR)
           set_intr_gate(vector, interrupt[i]);
   }

   /* setup after call gates are initialised (usually add in
    * the architecture specific gates); architecture-dependent
    */
   intr_init_hook();

   /*
    * Set the clock to HZ Hz, we already have a valid
    * vector now:
    */
   setup_pit_timer();

   /*
    * External FPU? Set up irq13 if so, for
    * original braindamaged IBM FERR coupling.
    */
   if (boot_cpu_data.hard_math && !cpu_has_fpu)
       setup_irq(FPU_IRQ, &fpu_irq);

   irq_ctx_init(smp_processor_id());
}


/*
 * mem_init - hand boot memory over to the page allocator and report totals.
 *
 * Frees the bootmem pages onto the free lists, counts reserved low-memory
 * RAM pages, initialises highmem pages, registers the kcore ranges and
 * prints the "Memory:" summary line.  Takes no arguments and returns nothing.
 */
void __init mem_init(void)
{
   extern int ppro_with_ram_bug(void);    /* checks whether this Pentium is one of the buggy CPUs */
   int codesize, reservedpages, datasize, initsize;
   int tmp;
   int bad_ppro;

#ifndef CONFIG_DISCONTIGMEM
   if (!mem_map)
       BUG();
#endif

   bad_ppro = ppro_with_ram_bug();

#ifdef CONFIG_HIGHMEM
   /* check that fixmap and pkmap do not overlap */
   if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
       printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n");
       printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START);
       BUG();
   }
#endif

   set_max_mapnr_init();    /* set up the highmem region */

#ifdef CONFIG_HIGHMEM
   high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
#else
   high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
#endif

   /* this will put all low memory onto the freelists: releases, per the
    * bootmem bitmap, every page available for dynamic allocation */
   totalram_pages += __free_all_bootmem();

   reservedpages = 0;
   for (tmp = 0; tmp < max_low_pfn; tmp++)
       /*
        * Only count reserved RAM pages
        */
       if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
           reservedpages++;

   set_highmem_pages_init(bad_ppro);

   /* section sizes in bytes, taken from the linker symbols */
   codesize = (unsigned long) &_etext - (unsigned long) &_text;
   datasize = (unsigned long) &_edata - (unsigned long) &_etext;
   initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

   kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);    /* initialise kcore_mem; presumably the real (low) memory - TODO confirm */
   kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
          VMALLOC_END-VMALLOC_START);    /* register the vmalloc range */

   /* page counts are shifted by (PAGE_SHIFT-10) to convert them to kilobytes */
   printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
       (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
       num_physpages << (PAGE_SHIFT-10),
       codesize >> 10,
       reservedpages << (PAGE_SHIFT-10),
       datasize >> 10,
       initsize >> 10,
       (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
        );

#ifdef CONFIG_X86_PAE
   if (!cpu_has_pae)
       panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
#endif
   /* a negative wp_works_ok means the write-protect test has not been run yet */
   if (boot_cpu_data.wp_works_ok < 0)
       test_wp_bit();

   /*
    * Subtle. SMP is doing its boot stuff late (because it has to
    * fork idle threads) - but it also needs low mappings for the
    * protected-mode entry to work. We zap these entries only after
    * the WP-bit has been tested.
    */
#ifndef CONFIG_SMP
   zap_low_mappings();
#endif
}


/* Initialisation.
* Called after the gfp() functions have been enabled, and before smp_init().
*
* Bootstraps the slab allocator in the five steps described inside the
* function: sets up the statically allocated cache_cache, creates one
* normal and one DMA kmalloc cache per malloc_sizes entry, swaps the
* bootstrap head arrays for kmalloc'ed copies, calls enable_cpucache()
* on every cache on cache_chain, then marks g_cpucache_up as FULL and
* registers cpucache_notifier.
*/
void __init kmem_cache_init(void)
{
   size_t left_over;
   struct cache_sizes *sizes;
   struct cache_names *names;

   /*
    * Fragmentation resistance on low memory - only use bigger
    * page orders on machines with more than 32MB of memory.
    */
   if (num_physpages > (32 << 20) >> PAGE_SHIFT)    /* more than 32MB worth of pages */
       slab_break_gfp_order = BREAK_GFP_ORDER_HI;

   
   /* Bootstrap is tricky, because several objects are allocated
    * from caches that do not exist yet:
    * 1) initialize the cache_cache cache: it contains the kmem_cache_t
    *   structures of all caches, except cache_cache itself: cache_cache
    *   is statically allocated.
    *   Initially an __init data area is used for the head array, it's
    *   replaced with a kmalloc allocated array at the end of the bootstrap.
    * 2) Create the first kmalloc cache.
    *   The kmem_cache_t for the new cache is allocated normally. An __init
    *   data area is used for the head array.
    * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
    * 4) Replace the __init data head arrays for cache_cache and the first
    *   kmalloc cache with kmalloc allocated arrays.
    * 5) Resize the head arrays of the kmalloc caches to their final sizes.
    */

   /* 1) create the cache_cache */
   init_MUTEX(&cache_chain_sem);    /* semaphore taken around the cache_chain walk in step 5 */
   INIT_LIST_HEAD(&cache_chain);    /* start with an empty chain of caches */
   list_add(&cache_cache.next, &cache_chain);    /* link cache_cache itself in, right after the list head */
   cache_cache.colour_off = cache_line_size();    /* annotator's note says 128 here - depends on the CPU, TODO confirm */
   cache_cache.array[smp_processor_id()] = &initarray_cache.cache;

   /* Round the object size up to a whole number of cache lines. */
   cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());

   cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
               &left_over, &cache_cache.num);
   /* A slab that cannot hold even one object is fatal. */
   if (!cache_cache.num)
       BUG();

   cache_cache.colour = left_over/cache_cache.colour_off;
   cache_cache.colour_next = 0;
   cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
               sizeof(struct slab), cache_line_size());

   /* 2+3) create the kmalloc caches */
   sizes = malloc_sizes;
   names = cache_names;

   /* Both tables are walked in lock step; a zero cs_size ends the loop. */
   while (sizes->cs_size) {
       /* For performance, all the general caches are L1 aligned.
        * This should be particularly beneficial on SMP boxes, as it
        * eliminates "false sharing".
        * Note for systems short on memory removing the alignment will
        * allow tighter packing of the smaller caches. */
       sizes->cs_cachep = kmem_cache_create(names->name,
           sizes->cs_size, ARCH_KMALLOC_MINALIGN,
           (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);

       /* Inc off-slab bufctl limit until the ceiling is hit. */
       if (!(OFF_SLAB(sizes->cs_cachep))) {
           offslab_limit = sizes->cs_size-sizeof(struct slab);
           offslab_limit /= sizeof(kmem_bufctl_t);
       }

       sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
           sizes->cs_size, ARCH_KMALLOC_MINALIGN,
           (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
           NULL, NULL);

       sizes++;
       names++;
   }
   /* 4) Replace the bootstrap head arrays */
   {
       void * ptr;
       
       /*
        * NOTE(review): neither kmalloc() result in this block is checked
        * for NULL before memcpy() writes through it.
        */
       ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
       /* Interrupts stay off while the head array pointer is swapped. */
       local_irq_disable();
       BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
       memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
       cache_cache.array[smp_processor_id()] = ptr;
       local_irq_enable();
   
       /* Same swap for the first kmalloc cache. */
       ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
       local_irq_disable();
       BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
       memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
               sizeof(struct arraycache_init));
       malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
       local_irq_enable();
   }

   /* 5) resize the head arrays to their final sizes */
   {
       kmem_cache_t *cachep;
       down(&cache_chain_sem);
       list_for_each_entry(cachep, &cache_chain, next)
           enable_cpucache(cachep);    /* applied to every cache currently on the chain */
       up(&cache_chain_sem);
   }

   /* Done! */
   g_cpucache_up = FULL;

   /* Register a cpu startup notifier callback
    * that initializes ac_data for all new cpus
    */
   register_cpu_notifier(&cpucache_notifier);
   

   /* The reap timers are started later, with a module init call:
    * That part of the kernel is not yet operational.
    */
}

/*
* Set up the first pidmap page and reserve PID 0 for the current task.
*
* Allocates one zeroed page for pidmap_array->page, marks bit 0 in it as
* used and decrements the matching free counter, then attaches the
* current task under id 0 for every PID type below PIDTYPE_MAX.
*
* Panics if the page cannot be allocated: set_bit() would otherwise
* write through a NULL pointer.
*/
void __init pidmap_init(void)
{
   int i;

   pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL);
   if (!pidmap_array->page)
       panic("pidmap_init: unable to allocate the initial pidmap page");

   set_bit(0, pidmap_array->page);
   atomic_dec(&pidmap_array->nr_free);

   /*
    * Allocate PID 0, and hash it via all PID types:
    * (per the original annotation these are pid, pgid, tgid and sid -
    * TODO confirm against the PIDTYPE_* definitions)
    */

   for (i = 0; i < PIDTYPE_MAX; i++)
       attach_pid(current, i, 0);
}

/*
* The last stretch of boot is finished here, in a function that is not
* marked __init. Otherwise the root thread and the init thread could
* race, and free_initmem could reap start_kernel before the root thread
* has reached cpu_idle.
*
* noinline is required because gcc-3.4 inlines this function by accident.
*/

static void noinline rest_init(void)
   __releases(kernel_lock)
{
   /* Clone flags for the thread that runs init(). */
   const unsigned long init_clone_flags = CLONE_FS | CLONE_SIGHAND;

   /* Start the init kernel thread. */
   kernel_thread(init, NULL, init_clone_flags);

   numa_default_policy();

   /* Release the kernel lock, then enter cpu_idle(). */
   unlock_kernel();
   cpu_idle();
}

static int init(void * unused)
{
   lock_kernel();
   /*
    * Tell the world that we're going to be the grim
    * reaper of innocent orphaned children. 所有进程的父进程
    *
    * We don't want people to have to make incorrect
    * assumptions about where in the task array this
    * can be found.
    */
   child_reaper = current;

   /* Sets up cpus_possible() */
   smp_prepare_cpus(max_cpus); /*主cpu会依次启动各个从cpu。见smp_boot_cpus->do_boot_cpu()*/

   do_pre_smp_initcalls();    /* 启动migration_thread,ksoftirqd等CPU进程 */

   fixup_cpu_present_map();
   smp_init();    /* 主要设置APIC */
   sched_init_smp();

   /*
    * Do this before initcalls, because some drivers want to access
    * firmware files.
    */
   populate_rootfs();    /* 生成initrd文件 */

   do_basic_setup();

   /*
    * check if there is an early userspace init. If yes, let it do all
    * the work
    */
   if (sys_access((const char __user *) "/init", 0) == 0)
       execute_command = "/init";
   else
       prepare_namespace();    /* 装载initrd,安装模块,mount根文件系统 */

   /*
    * Ok, we have completed the initial bootup, and
    * we're essentially up and running. Get rid of the
    * initmem segments and start the user-mode stuff..
    */
   free_initmem();
   unlock_kernel();
   system_state = SYSTEM_RUNNING;
   numa_default_policy();

   if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
       printk("Warning: unable to open an initial console.\n");

   (void) sys_dup(0);
   (void) sys_dup(0);
   
   /*
    * We try each of these until one succeeds.
    *
    * The Bourne shell can be used instead of init if we are
    * trying to recover a really broken machine.
    */

   if (execute_command)
       run_init_process(execute_command);

   run_init_process("/sbin/init");
   run_init_process("/etc/init");
   run_init_process("/bin/init");
   run_init_process("/bin/sh");

   panic("No init found. Try passing init= option to kernel.");
}