在ARM汇编中优化块复制

问题描述 投票:1回答:1

我正在尝试从内存占用和执行时间两方面优化一个块复制循环:循环中把寄存器里的每个值除以2,并将结果饱和到最大值5。但我目前的代码包含很多分支指令。我想知道,在一次加载8个寄存器之后,有没有办法一次性处理所有这些寄存器,以减少多次分支?或者有没有一条指令可以一次完成全部处理?

 ; Image header: selects the instruction set, exports the two symbols
 ; defined for start-up, and lays out a two-entry vector table.
 THUMB
 AREA RESET, CODE, READONLY
 EXPORT  __Vectors
 EXPORT Reset_Handler
 ; Vector table: word 0 is the value commented as the top of the stack,
 ; word 1 is the address of Reset_Handler (the reset vector).
__Vectors 
 DCD 0x00180000     ; top of the stack
  DCD Reset_Handler  ; reset vector - where the program starts

  AREA Task2b_Code, CODE, READONLY

  ; ------------------------------------------------------------------
  ; Reset_Handler - copy num_words words from source to dest, passing
  ; every word through data_processing on the way.
  ;
  ; Words are moved in blocks of 8 (LDMIA/STMIA) while at least 8
  ; remain; any leftover (fewer than 8) are moved one at a time.
  ; The handler never returns: it spins at exit once the copy is done.
  ;
  ; Register roles:
  ;   r0      read pointer into source  (post-incremented)
  ;   r1      write pointer into dest   (post-incremented)
  ;   r2      total number of words to copy
  ;   r3      loop counter (blocks first, then leftover words)
  ;   r4      in/out register of data_processing
  ;   r5-r12  the 8 words of the block currently being processed
  ;   lr      overwritten by every BL data_processing
  ;
  ; NOTE: labels must start in the first column for armasm, so
  ; Reset_Handler, num_words and start are deliberately not indented.
  ; ------------------------------------------------------------------
Reset_Handler
  ENTRY

num_words EQU (end_source-source)/4  ; number of words to copy

start
  LDR r0,=source     ; r0 -> start of the area of memory to copy from
  LDR r1,=dest       ; r1 -> start of the area of memory to copy to
  MOV r2,#num_words  ; r2 = number of words to copy

  ; find out how many blocks of 8 words need to be copied - it is assumed
  ; that it is faster to load 8 data items at a time, rather than load
  ; individually
block
  MOVS r3,r2,LSR #3  ; r3 = num_words / 8 = number of full blocks
  BEQ individ        ; no full block: go straight to the single words

  ; copy and process blocks of 8 words; each word is routed through r4
  ; because data_processing takes and returns its value in r4
block_loop
  LDMIA r0!,{r5-r12}  ; load 8 words and advance the read pointer
  MOV r4,r5           ; first item
  BL data_processing
  MOV r5,r4
  MOV r4,r6           ; second item
  BL data_processing
  MOV r6,r4
  MOV r4,r7           ; third item
  BL data_processing
  MOV r7,r4
  MOV r4,r8           ; fourth item
  BL data_processing
  MOV r8,r4
  MOV r4,r9           ; fifth item
  BL data_processing
  MOV r9,r4
  MOV r4,r10          ; sixth item
  BL data_processing
  MOV r10,r4
  MOV r4,r11          ; seventh item
  BL data_processing
  MOV r11,r4
  MOV r4,r12          ; eighth item
  BL data_processing
  MOV r12,r4
  STMIA r1!,{r5-r12}  ; store the 8 processed words, advance write pointer
  SUBS r3,r3,#1       ; one block fewer to go
  BNE block_loop      ; continue until the last block is done

  ; there may now be some data items left over (fewer than 8)
  ; find out how many of these individual words need to be copied
individ
  ANDS r3,r2,#7   ; r3 = num_words mod 8 = words left to copy singly
  BEQ exit        ; nothing left: done

  ; copy the leftover words one at a time
individ_loop
  LDR r4,[r0],#4      ; load next word, advance read pointer
  BL data_processing  ; process it in place in r4
  STR r4,[r1],#4      ; store it, advance write pointer
  SUBS r3,r3,#1       ; one word fewer to go
  BNE individ_loop    ; continue until the last word is done

  ; languish in an endless loop once all is done
exit
  B exit

  ; ------------------------------------------------------------------
  ; data_processing - scale a value by 0.5, saturating to a maximum of 5
  ;   In:    r4 = value, treated as an unsigned 32-bit word
  ;   Out:   r4 = 5 when the input is >= 10, otherwise input / 2
  ;   Clobb: condition flags
  ;   Returns with BX lr; no other register is touched.
  ;
  ; The comparison is unsigned (HS/LO) so that it agrees with the
  ; logical shift used for the division: with a signed compare, a word
  ; with bit 31 set would take the shift path and come out far above
  ; the maximum of 5.
  ; Conditional execution inside an IT block replaces the internal
  ; branch, so the routine is straight-line code with a single return.
  ; ------------------------------------------------------------------
data_processing
  CMP r4,#10           ; does the halved value reach the limit of 5?
  ITE HS
  MOVHS r4,#5          ; input >= 10: saturate to 5
  LSRLO r4,r4,#1       ; input <  10: halve it
  BX lr

  ; Read-only input: the words between source and end_source are the
  ; data to copy (28 words here); num_words is derived from these labels.
  AREA Task2b_ROData, DATA, READONLY
source  ; first word of the data to copy
  DCD 1,2,3,4,5,6,7,8,9,10,11,0,4,6,12,15,13,8,5,4,3,2,1,6,23,11,9,10 
end_source

  ; Read/write output buffer, reserved with the same size in bytes as
  ; the source data (end_source-source).
  AREA Task2b_RWData, DATA, READWRITE
dest  ; first word of the destination buffer
  SPACE end_source-source
end_dest
  END
performance assembly optimization arm micro-optimization
1个回答
0
投票

使用此简单代码

; Minimal word-by-word block copy: moves WORD_COUNT 32-bit words from
; SRC_ADDR to DST_ADDR with no per-word processing, then spins forever.
;   r0 = read pointer, r1 = write pointer (both post-incremented by 4)
;   r3 = word in transit, r4 = words remaining (must start above 0)
; The two regions do not overlap: 5 words occupy 0x14 bytes and the
; destination starts 0x20 bytes after the source.
; Directives are indented because armasm treats anything starting in
; the first column as a label.
SRC_ADDR   EQU 0x40000000       ; address the words are read from
DST_ADDR   EQU 0x40000020       ; address the words are written to
WORD_COUNT EQU 5                ; number of words to copy

 AREA blockCopy, CODE
 ENTRY
 LDR r0,=SRC_ADDR
 LDR r1,=DST_ADDR
 MOV r4,#WORD_COUNT

up LDR r3,[r0],#4               ; load next word, advance read pointer
 STR r3,[r1],#4                 ; store it, advance write pointer
 SUBS r4,r4,#1                  ; one word fewer to go
 BNE up                         ; repeat until the count reaches zero

stop B stop                     ; done: loop here forever
 END
© www.soinside.com 2019 - 2024. All rights reserved.