zImageのロード&展開調査/ARM (2) - まだ見えない先の

zImageのロード&展開調査/ARM (1) - まだ見えない先のの続き
前回は，自己解凍からImage呼び出しまでの概要を追った．
今回は，前回触れなかった細かい部分(ページテーブルの生成，キャッシュ操作，再配置など)を見ていく．
@ Build page tables so that both the I cache and the D cache can be enabled.
/*
 * Turn on the cache.  We need to setup some page tables so that we
 * can have both the I and D caches on.
 *
 * We place the page tables 16k down from the kernel execution address,
 * and we hope that nothing else is using it.  If we're using it, we
 * will go pop!
 *
 * On entry,
 *  r4 = kernel execution address
 *  r6 = processor ID
 *  r7 = architecture number
 *  r8 = atags pointer
 *  r9 = run-time address of "start"  (???)
 * On exit,
 *  r1, r2, r3, r9, r10, r12 corrupted
 * This routine must preserve:
 *  r4, r5, r6, r7, r8
 */
                .align  5
@ call_cache_fn is the shared dispatcher for the architecture-dependent cache
@ routines; the value passed in r3 selects which kind of routine it jumps to.
@ r3 = 8 selects the 'cache on' routine.
cache_on:       mov     r3, #8                  @ cache_on function
                b       call_cache_fn

...

@ Build a flat (virtual == physical) first-level page table.
@ Lookup rule used throughout: (virtual address >> 20) * 4, i.e. the address
@ shifted right by 18 with the low two bits clear, added to the page table
@ base, is the address of the entry describing that 1MB of address space.
@ r3 = base of a 2^14 (16KB) byte page table placed just below r4 (the kernel
@ execution address, where the decompressed kernel finally lives).
@ 16KB is the size of the table filled in below: 4096 entries x 4 bytes.
@ Nothing here checks whether that region overlaps zImage itself; if it does,
@ the image is overwritten.
__setup_mmu:
                sub     r3, r4, #16384          @ Page directory size
@ Round further down to a 16KB boundary by clearing the low 14 bits
@ (0xff | 0x3f00 = 0x3fff).  Two bic instructions are needed because 0x3fff
@ does not fit a single ARM immediate operand.
                bic     r3, r3, #0xff           @ Align the pointer
                bic     r3, r3, #0x3f00
/*
 * Initialise the page tables, turning on the cacheable and bufferable
 * bits for the RAM area only.
 */
                mov     r0, r3                  @ r0 = write cursor into the table
@ r9 = r0 & ~0x0003FFFF (page table base rounded down to a 256KB boundary)
                mov     r9, r0, lsr #18
                mov     r9, r9, lsl #18         @ start of RAM; physical == virtual (identity mapping)

@ r10 = r9 + 256MB
                add     r10, r9, #0x10000000    @ a reasonable RAM size

@ r1 = 0x12 (binary 0001 0010)
                mov     r1, #0x12               @ first-level descriptor:
                                                @  section entry,
                                                @  cacheable off, bufferable off
                                                @  (bits 0x0c are clear)
@ r1 = 0xc12 (binary 1100 0001 0010)
                orr     r1, r1, #3 << 10        @ access permission field = 3: read/write in both privileged and user mode
                add     r2, r3, #16384          @ r2 = end of the page table

@ Walk all 4096 entries.  Entries whose value lies in [r9, r10), the 256MB
@ window that starts at r9 and contains the page table, get the cacheable and
@ bufferable bits; all others have them cleared.  The start of the area the
@ decompressed kernel is placed in (r4) lies just above the table, so it is
@ inside this window too.
1:              cmp     r1, r9                  @ if virt > start of RAM
                orrhs   r1, r1, #0x0c           @ set cacheable, bufferable
                cmp     r1, r10                 @ if virt > end of RAM
                bichs   r1, r1, #0x0c           @ clear cacheable, bufferable
                str     r1, [r0], #4            @ 1:1 mapping

@ Advance r1 by 1MB
                add     r1, r1, #1048576        @ one section entry covers 1MB, so this
                                                @ adds 1 to the base-address field
@ Repeat until the end of the page table
                teq     r0, r2                  @ 4096 entries x 1MB each = the full 4GB is mapped
                bne     1b

@ Also enable cache and buffer for the code currently executing (the region
@ the pc is in, i.e. where zImage itself resides).
/*
 * If ever we are running from Flash, then we surely want the cache
 * to be enabled also for our execution instance...  We map 2MB of it
 * so there is no map overlap problem for up to 1 MB compressed kernel.
 * If the execution is in RAM then we would only be duplicating the above.
 */

                mov     r1, #0x1e               @ binary 0001 1110 = section entry, cacheable on, bufferable on
                orr     r1, r1, #3 << 10        @ access permission field = 3: read/write in both privileged and user mode
@ r2 = pc >> 20: the 1MB section number of the running code.  The low 20 bits
@ are dropped because a section entry only keeps the top 12 address bits.
                mov     r2, pc, lsr #20
@ Entry value = attributes (r1) | section base (r2 << 20)
                orr     r1, r1, r2, lsl #20
@ r0 = table base (r3) + section number * 4 (same as pc >> 18 with the low two
@ bits clear): the entry that maps the section containing the pc.
                add     r0, r3, r2, lsl #2
                str     r1, [r0], #4            @ store the entry, then advance r0 by one 4-byte entry
                add     r1, r1, #1048576        @ map the following 1MB as well
                str     r1, [r0]

@ Return to the instruction after "bl __setup_mmu" (bl stored that address in lr)
                mov     pc, lr

@ 'cache on' method referenced from the proc_types table: builds the page
@ table, then turns on the MMU and caches via __common_mmu_cache_on.
__armv4_mmu_cache_on:
@ The bl below overwrites lr, so keep the return address (set up by the
@ earlier "bl cache_on") in r12.
                mov     r12, lr
                bl      __setup_mmu             @ leaves the page table base in r3
                mov     r0, #0

@ Preparation sequence before the MMU is switched on
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x0030         @ NOTE(review): meaning of bits 4 and 5 is not stated here; check the control register description

                bl      __common_mmu_cache_on   @ consumes r0 (control value) and r3 (page table base)
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs

@ Return to the instruction after "bl cache_on"
                mov     pc, r12

...

@ Shared tail of the 'cache on' methods.
@ In:  r0 = control register value prepared by the caller
@      r3 = page table base (from __setup_mmu)
@ Clobbers r0 and r1; returns through lr.
__common_mmu_cache_on:
...
                orr     r0, r0, #0x000d         @ Write buffer, mmu
...
                mov     r1, #-1                 @ all bits set, written to the domain access control register below
@ Register the page table address
                mcr     p15, 0, r3, c2, c0, 0   @ load page table pointer

                mcr     p15, 0, r1, c3, c0, 0   @ load domain access control
                b       1f
                .align  5                       @ cache line aligned
1:              mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back to

@ "r0, lsr #32" evaluates to 0, so this is pc = lr; using r0 as an operand
@ makes the return depend on the read-back above.
                sub     pc, lr, r0, lsr #32     @ properly flush pipeline

/*
 * All code following this line is relocatable.  It is relocated by
 * the above code to the end of the decompressed kernel image and
 * executed there.  During this time, we have no stacks.
 *
 * r0     = decompressed kernel length
 * r1-r3  = unused
 * r4     = kernel execution address
 * r5     = decompressed kernel start
 * r6     = processor ID
 * r7     = architecture ID
 * r8     = atags pointer
 * r9-r14 = corrupted
 */
                .align  5
@ r9 = end of the data to copy = start (r5) + length (r0) - 128.
@ NOTE(review): per the original annotation, the (not shown) caller computed
@ r0 as (decompress_kernel return value + 128 + 127) & ~127, i.e. the image
@ length plus 128 bytes of stack, rounded up to a multiple of 128 - confirm
@ against the caller.
reloc_start:    add     r9, r5, r0
                sub     r9, r9, #128            @ do not copy the stack
...
@ The kernel was decompressed at r5 although it must run at r4, so move it
@ to r4 now.  r1 = destination cursor.
                mov     r1, r4
@ Open question left by the annotator: why is the method below the efficient one?
@ Copy from r5 to r1.  Each ldmia/stmia pair moves 8 registers (32 bytes) and
@ the pair is repeated 4 times, so one loop iteration copies 128 bytes.
@ r13 and r14 are used as plain data registers here; there is no stack yet.
1:
                .rept   4
                ldmia   r5!, {r0, r2, r3, r10 - r14}    @ relocate kernel
                stmia   r1!, {r0, r2, r3, r10 - r14}
                .endr

                cmp     r5, r9
                blo     1b                      @ unsigned compare: keep going while source < end
@ r1 has been advanced by the copy and now points at the end of the kernel.
@ The stack bottom is set 128 bytes past that end.
                add     sp, r1, #128            @ relocate the stack
...

@ Flush and disable the caches, then jump to the relocated kernel at r4.
@ This does not return.
call_kernel:    bl      cache_clean_flush
                bl      cache_off

@ Arguments expected by Image (the kernel image)
                mov     r0, #0                  @ must be zero
                mov     r1, r7                  @ restore architecture number
                mov     r2, r8                  @ restore atags pointer
                mov     pc, r4                  @ call kernel


/*
 * Here follow the relocatable cache support functions for the
 * various processors.  This is a generic hook for locating an
 * entry and jumping to an instruction at the specified offset
 * from the start of the block.  Please note this is all position
 * independent code.
 *
 *  r1  = corrupted
 *  r2  = corrupted
 *  r3  = block offset
 *  r6  = corrupted
 *  r12 = corrupted
 */

call_cache_fn:  adr     r12, proc_types         @ r12 = cursor over the proc_types entries
...
@ r6 = processor ID (real_id)
                mrc     p15, 0, r6, c0, c0      @ get processor ID
...
1:
@ Load the first two words of the current proc_types entry:
@ r1 = CPU ID match, r2 = CPU ID mask
                ldr     r1, [r12, #0]           @ get value
                ldr     r2, [r12, #4]           @ get mask

@ If ((real_id ^ match) & mask) == 0 the CPU matches, and execution jumps to
@ byte offset r3 inside that entry.
@ r3 = 8 when entered from cache_on, 12 from cache_off, 16 from cache_clean_flush.
                eor     r1, r1, r6              @ (real ^ match)
                tst     r1, r2                  @       & mask
                addeq   pc, r12, r3             @ call cache function

@ Move on to the next entry (5 words per entry).  The loop has no explicit
@ bound; the all-zero match/mask entry in proc_types matches any ID.
                add     r12, r12, #4*5
                b       1b

/*
 * Table for cache operations.  This is basically:
 *   - CPU ID match
 *   - CPU ID mask
 *   - 'cache on' method instruction
 *   - 'cache off' method instruction
 *   - 'cache flush' method instruction
 *
 * We match an entry using: ((real_id ^ match) & mask) == 0
 *
 * Writethrough caches generally only need 'on' and 'off'
 * methods.  Writeback caches _must_ have the flush method
 * defined.
 */
@ Each entry is exactly 5 words (20 bytes): two data words followed by three
@ single instructions at byte offsets 8, 12 and 16.  call_cache_fn jumps
@ straight to one of those offsets, so the layout must not change.
                .type   proc_types,#object
proc_types:
...
                .word   0x00060000              @ ARMv5TEJ
                .word   0x000f0000              @ mask: compare ID bits [19:16] only
                b       __armv4_mmu_cache_on
                b       __armv4_mmu_cache_off
                b       __armv4_mmu_cache_flush

                .word   0x0007b000              @ ARMv6
                .word   0x0007f000              @ mask: compare ID bits [18:12] only
                b       __armv4_mmu_cache_on
                b       __armv4_mmu_cache_off
                b       __armv6_mmu_cache_flush

@ Catch-all entry: match 0 with mask 0 matches every ID; all three methods
@ simply return.
                .word   0                       @ unrecognised type
                .word   0
                mov     pc, lr
                mov     pc, lr
                mov     pc, lr

                .size   proc_types, . - proc_types

/*
 * Turn off the Cache and MMU.  ARMv3 does not support
 * reading the control register, but ARMv4 does.
 *
 * On entry,  r6 = processor ID
 * On exit,   r0, r1, r2, r3, r12 corrupted
 * This routine must preserve: r4, r6, r7
 */
                .align  5
@ r3 = 12 is the byte offset of the 'cache off' method inside a proc_types entry.
cache_off:      mov     r3, #12                 @ cache_off function
                b       call_cache_fn

...
@ 'cache off' method referenced from the proc_types table.
@ Clears the control register bits selected by 0x000d, then invalidates the
@ caches and the TLB.  Clobbers r0; returns through lr.
__armv4_mmu_cache_off:
                mrc     p15, 0, r0, c1, c0      @ read control register
                bic     r0, r0, #0x000d         @ clear bits 0, 2 and 3
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c7      @ invalidate whole cache v4
                mcr     p15, 0, r0, c8, c7      @ invalidate whole TLB v4
                mov     pc, lr

...

/*
 * Clean and flush the cache to maintain consistency.
 *
 * On entry,
 *  r6 = processor ID
 * On exit,
 *  r1, r2, r3, r11, r12 corrupted
 * This routine must preserve:
 *  r0, r4, r5, r6, r7
 */
                .align  5
cache_clean_flush:
                mov     r3, #16                 @ byte offset of the 'cache flush' method in a proc_types entry
                b       call_cache_fn

...
@ 'cache flush' method referenced from the proc_types table.
@ Works out how many bytes to read (r2) and the read stride (r11), reads
@ that much memory starting at the cache-line-aligned pc to flush the D cache
@ in software, then issues the CP15 flush/drain operations.
@ In: r6 = processor ID.  Clobbers r1, r2, r3, r11; returns through lr.
__armv4_mmu_cache_flush:
                mov     r2, #64*1024            @ default: 32K dcache size (*2)
                mov     r11, #32                @ default: 32 byte line size
                mrc     p15, 0, r3, c0, c0, 1   @ read cache type
                teq     r3, r6                  @ cache ID register present?
                beq     no_cache_id             @ read back the processor ID: keep the defaults
                mov     r1, r3, lsr #18
                and     r1, r1, #7              @ r1 = cache type bits [20:18]
                mov     r2, #1024
                mov     r2, r2, lsl r1          @ base dcache size *2
                tst     r3, #1 << 14            @ test M bit
                addne   r2, r2, r2, lsr #1      @ +1/2 size if M == 1
                mov     r3, r3, lsr #12
                and     r3, r3, #3              @ r3 = cache type bits [13:12]
                mov     r11, #8
                mov     r11, r11, lsl r3        @ cache line size in bytes
no_cache_id:
                bic     r1, pc, #63             @ align to longest cache line
                add     r2, r1, r2              @ r2 = end address of the region to read
1:              ldr     r3, [r1], r11           @ s/w flush D cache
                teq     r1, r2
                bne     1b

                mcr     p15, 0, r1, c7, c5, 0   @ flush I cache
                mcr     p15, 0, r1, c7, c6, 0   @ flush D cache
                mcr     p15, 0, r1, c7, c10, 4  @ drain WB
                mov     pc, lr

...

                .ltorg                          @ emit the pending literal pool here
reloc_end:                                      @ label closing the region opened at reloc_start

                .align
                .section ".stack", "w"          @ separate writable section for the stack
user_stack:     .space  4096                    @ reserve 4096 bytes