Introduction

This documents my investigation into assembly language programming on the ARM processor running my SheevaPlug. This runs Debian GNU/Linux. I use the GNU assembler, gas, the GNU linker, ld and the GNU debugger, gdb.

I referred to GNU-ARM-Assy-Quick-Ref.pdf and More Assembler Directives. The instruction set is defined here

More links are here, here and here

Most of the examples follow those supplied with the excellent x86 assembly language book, Professional Assembly Language by Richard Blum.

The code is at git@github.com:bobblestiltskin/professional_assembly_language.git if you want to play with it.

Getting Started

  1. Familiarity with gdb is useful.
  2. This explains the basic usage well.

  3. Communicating with Linux
  4. We can invoke a system call to the operating system, which allows us to pass back one parameter, as the exit code of the program.

    
    
    bob@poland:~/src/asm$ /usr/bin/as -gstabs -o syscall.o syscall.s
    bob@poland:~/src/asm$ /usr/bin/ld -o syscall syscall.o
    bob@poland:~/src/asm$ ./syscall 
    bob@poland:~/src/asm$ echo $?
    42
    bob@poland:~/src/asm$
    
  5. The size of an executable
  6. The following 3 programs show the size allocated for uninitialised data.

    @ sizetest1.s - first of three programs used to compare executable sizes.
    @ This variant contains code only: it declares no data at all.
            .text
            .global _start
    _start:
            mov     r7, #1          @ r7 selects the syscall: 1 is exit
            mov     r0, #0          @ r0 carries the exit code, here 0
            svc     0               @ enter the kernel (svc is the newer name for swi)
    
    
    @ sizetest2.s - second of three programs used to compare executable sizes.
    @ This variant reserves a 10000-byte buffer in .bss, with no initial contents.
            .section .bss
            .lcomm  buffer, 10000   @ reserve 10000 bytes of uninitialised storage
            .text
            .global _start
    _start:
            mov     r7, #1          @ r7 selects the syscall: 1 is exit
            mov     r0, #0          @ r0 carries the exit code, here 0
            svc     0               @ enter the kernel (svc is the newer name for swi)
    
    
    @ sizetest3.s - third of three programs used to compare executable sizes.
    @ This variant places a 10000-byte buffer in .data, so its contents are
    @ emitted by the assembler.
            .data
    buffer:
            .fill   10000, 1, 0     @ 10000 items, each 1 byte wide, each zero
            .text
            .global _start
    _start:
            mov     r7, #1          @ r7 selects the syscall: 1 is exit
            mov     r0, #0          @ r0 carries the exit code, here 0
            svc     0               @ enter the kernel (svc is the newer name for swi)
    
    
    bob@poland:~/www/examples$make sizetest1 sizetest2 sizetest3
    /usr/bin/as -gstabs -o sizetest1.o sizetest1.s
    /usr/bin/ld -o sizetest1 sizetest1.o
    /usr/bin/as -gstabs -o sizetest2.o sizetest2.s
    /usr/bin/ld -o sizetest2 sizetest2.o
    /usr/bin/as -gstabs -o sizetest3.o sizetest3.s
    /usr/bin/ld -o sizetest3 sizetest3.o
    bob@poland:~/www/examples$ls -l sizetest?
    -rwxr-xr-x 1 bob bob   938 Jul  3 16:17 sizetest1
    -rwxr-xr-x 1 bob bob  1072 Jul  3 16:17 sizetest2
    -rwxr-xr-x 1 bob bob 11072 Jul  3 16:17 sizetest3
    bob@poland:~/www/examples$
    
  7. Communicating via C functions
  8. We can call C functions and pass parameters to them.

    
    
    Note that we change _start to main since we use gcc to assemble and link because we want to link in the C library so that we can access printf.
    bob@poland:~/src/asm$ gcc -o printf printf.s
    bob@poland:~/src/asm$ ./printf 
    Sum of 1 and 41 is 42
    bob@poland:~/src/asm$
    

Moving Data

  1. An example of moving data from memory to a register
  2. I write a makefile to build the software and use rodata (for my read-only data).

    @ movtest1.s - load one byte from read-only memory into a register and
    @ return it to the shell as the exit code.
            .section .rodata
    answer:
            .byte   42

            .text
            .global _start
    _start:
            ldr     r1, =answer     @ r1 = address of the byte
            ldrb    r0, [r1], #1    @ r0 = byte at [r1]; r1 is then advanced by 1
            mov     r7, #1          @ r7 selects the syscall: 1 is exit
            svc     0               @ exit, with r0 as the exit code
    
    
    bob@poland:~/www/examples$ make movtest1
    /usr/bin/as -gstabs -o movtest1.o movtest1.s
    /usr/bin/ld -o movtest1 movtest1.o
    bob@poland:~/www/examples$ ./movtest1 
    bob@poland:~/www/examples$ echo $?
    42
    bob@poland:~/www/examples$
    
  3. An example of moving register data to memory
  4. Set r0 to a value and store it to memory. Then double the value in r0 and load the stored byte back into r0. The return code is the stored value rather than the doubled one, which shows that both the store and the load executed correctly.

    @ movtest2.s - store a register to memory, change the register, then load
    @ the stored value back and return it as the exit code.
            .data
    value:
            .byte   42              @ initial contents; overwritten by the strb below

            .text
            .global _start
    _start:
            ldr     r1, =value      @ r1 = address of the byte
            mov     r0, #9
            strb    r0, [r1]        @ store the low byte of r0 (9) at address r1
            add     r0, r0, r0      @ r0 = 18, so the register now differs from memory
            ldrb    r0, [r1]        @ load the byte at address r1 back into r0 (9)
            mov     r7, #1          @ r7 selects the syscall: 1 is exit
            svc     0               @ exit, with r0 as the exit code
    
    
    
    bob@poland:~/www/examples$ make movtest2
    /usr/bin/as -gstabs -o movtest2.o movtest2.s
    /usr/bin/ld -o movtest2 movtest2.o
    bob@poland:~/www/examples$ ./movtest2 
    bob@poland:~/www/examples$ echo $?
    9
    bob@poland:~/www/examples$
    
  5. An example of using indexed memory locations
  6. The .equ directive is introduced here; it gives a name to a constant value. Naming the element size means that we can easily change the storage of the values from bytes to half-words (2 bytes) or words (4 bytes): change datum_size and the data directive in the data section, and replace the ldrb instruction with ldrh or ldr respectively.

    Again we use gcc rather than as to assemble since we use the C function, printf.

    @ movtest3.s - Another example of using indexed memory locations
    @
    @ Walks a table of bytes with a post-indexed load and prints each element
    @ with printf. main is entered and left like a C function, so it saves the
    @ registers it uses and returns through the saved lr.
    .equ datum_size,1
    .globl main
    .section .rodata
    output:
            .asciz "The value is %d\n"
    values:
            .byte 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
    endvalues:
    .text
    main:
            stmfd   sp!, {r4, r5, r6, lr}   @ r4 is pushed only so that sp stays 8-byte aligned for printf
            ldr     r5, =endvalues          @ r5 = address one past the last element
            ldr     r6, =values             @ r6 = address of the current element
    loop:
            ldrb    r1, [r6], #datum_size   @ r1 = byte at r6, then r6 += datum_size
            ldr     r0, =output             @ r0 = format string
            bl      printf
            cmp     r6, r5                  @ reached the end of the table?
            bne     loop

            mov     r0, #0                  @ return 0 rather than printf's leftover result
            ldmfd   sp!, {r4, r5, r6, pc}   @ restore registers and return to the caller
    
    
    
    bob@poland:~/www/examples$ make movtest3
    /usr/bin/gcc -gstabs -o movtest3 movtest3.s
    bob@poland:~/www/examples$ ./movtest3
    The value is 10
    The value is 15
    The value is 20
    The value is 25
    The value is 30
    The value is 35
    The value is 40
    The value is 45
    The value is 50
    The value is 55
    The value is 60
    bob@poland:~/www/examples$
    
  7. An example of indirect addressing
  8. We write a constant to the second element of the vector and then read and return it.

    @ movtest4.s - pre-indexed addressing with write-back: overwrite the second
    @ element of a byte table, then read it back as the exit code.
    .equ datum_size, 1
            .data
    values:
            .byte   10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60

            .text
            .global _start
    _start:
            mov     r5, #100                @ value to store
            ldr     r4, =values             @ r4 = address of the first element
            strb    r5, [r4, #datum_size]!  @ r4 += datum_size, then store r5 at the new r4
            ldrb    r0, [r4]                @ read the same element back into r0
            mov     r7, #1                  @ r7 selects the syscall: 1 is exit
            svc     0                       @ exit, with r0 as the exit code
    
    
    
    bob@poland:~/www/examples$ make movtest4
    /usr/bin/as -gstabs -o movtest4.o movtest4.s
    /usr/bin/ld -o movtest4 movtest4.o
    bob@poland:~/www/examples$ ./movtest4 
    bob@poland:~/www/examples$ echo $?
    100
    bob@poland:~/www/examples$
    
  9. Converting endianness
  10. ARMv6 has a rev instruction that reverses the byte order of a word, converting between big- and little-endian values. The SheevaPlug uses ARMv5TE, so we have to do the reversal explicitly.

    @ swaptest.s  Converting big-endian to little-endian and vice-versa
    @  From ARMv6 on we can use the rev instruction; on earlier cores the bytes
    @  have to be reversed explicitly.
    @
    @  ARCH selects which variant is assembled. It can be supplied on the
    @  assembler command line (for example --defsym ARCH=6); when it is not
    @  supplied it defaults to 5 so that the .if below still assembles.

    .ifndef ARCH
    .equ ARCH, 5
    .endif

    .globl _start
    .section .data
    vstart:
            .word 0x12345678
    .text
    _start:
            ldr     r1, =vstart
            ldr     r0, [r1]                @ load word to r0
    .if ARCH >= 6
            rev     r0, r0
    .else
            and     r2, r0, #0xff000000     @ isolate byte 3 (most significant)
            and     r3, r0, #0x00ff0000     @ isolate byte 2
            and     r4, r0, #0x0000ff00     @ isolate byte 1
            and     r5, r0, #0x000000ff     @ isolate byte 0 (least significant)
            mov     r0, r2, lsr #24         @ byte 3 moves down to byte 0
            orr     r0, r0, r3, lsr #8      @ byte 2 moves down to byte 1
            orr     r0, r0, r4, lsl #8      @ byte 1 moves up to byte 2
            orr     r0, r0, r5, lsl #24     @ byte 0 moves up to byte 3
    .endif
    _stop:
            mov     r7, #1                  @ set r7 to 1 - the syscall for exit
            swi     0                       @ then invoke the syscall from linux
    
    
    
    bob@poland:~/www/examples$ grep architecture /proc/cpuinfo
    CPU architecture: 5TE
    bob@poland:~/www/examples$
    bob@poland:~/www/examples$ gdb endian
    GNU gdb (GDB) 7.0.1-debian
    Copyright (C) 2009 Free Software Foundation, Inc.
    License GPLv3+: GNU GPL version 3 or later 
    This is free software: you are free to change and redistribute it.
    There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
    and "show warranty" for details.
    This GDB was configured as "arm-linux-gnueabi".
    For bug reporting instructions, please see:
    ...
    Reading symbols from /home/bob/www/examples/endian...done.
    (gdb) b _start
    Breakpoint 1 at 0x8078: file endian.s, line 10.
    (gdb) b _stop
    Breakpoint 2 at 0x80a0: file endian.s, line 21.
    (gdb) run
    Starting program: /home/bob/www/examples/endian 
    
    Breakpoint 1, _start () at endian.s:11
    11		ldr	r0, [r1]		@ load word to r0
    Current language:  auto
    The current source language is "auto; currently asm".
    (gdb) x/x &vstart
    0x100a8 :	0x12345678
    (gdb) cont
    Continuing.
    
    Breakpoint 2, _stop () at endian.s:22
    22		swi     0			@ then invoke the syscall from linux
    (gdb) p/x $r0
    $2 = 0x78563412
    (gdb) quit
    A debugging session is active.
    
    	Inferior 1 [process 17053] will be killed.
    
    Quit anyway? (y or n) y
    bob@poland:~/www/examples$
    

    So we see the byte-reversed data in r0 when the program stops.

Structured Programs

  1. A subroutine to display a vector of bytes
  2. We abstract the program given above as movtest3.s as follows.

    The subroutine to print is given by

    @ this subroutine prints vectors of bytes
    @
    @ inputs
    @   r0 - start of vector
    @   r1 - number of elements to print (nothing is printed if <= 0)
    @   r2 - pointer to start of string used to print each element
    @
    @ no outputs
    @
    @ locals
    @   r4 - pointer to the next element
    @   r5 - number of elements still to print
    @   r6 - format string
    @
    .globl _vprintb
    .equ datum_size,1
    _vprintb:
            stmfd   sp!, {r4, r5, r6, lr}   @ save the registers we use and the return address
            cmp     r1, #0                  @ without this check a zero count would wrap
            ble     vprintb_done            @ in the subs below and the loop would run on
            mov     r4, r0                  @ copy the parameters to registers that
            mov     r5, r1                  @ are still intact after the call to printf
            mov     r6, r2
    vprintb_loop:
            ldrb    r1, [r4], #datum_size   @ r1 = next element, then advance the pointer
            mov     r0, r6                  @ r0 = format string
            bl      printf
            subs    r5, r5, #1              @ decrement counter
            bne     vprintb_loop            @ and loop if non-zero

    vprintb_done:
            ldmfd   sp!, {r4, r5, r6, pc}   @ restore registers and return
    
    

    A simple test harness to use the subroutine is given by

    @ Test harness for _vprintb: prints a table of bytes, one per line.
    @
    @ main returns to the C runtime instead of invoking the exit syscall
    @ directly, so that output buffered by printf is flushed even when stdout
    @ is not a terminal.
    .equ datum_size,1
    .equ num_values,11                      @ number of entries in values
    .globl main
    .section .rodata
    output:
            .asciz "The value is %d\n"
    values:
            .byte 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
    endvalues:
    .text
    main:
            stmfd   sp!, {r4, lr}           @ save lr; r4 is pushed only to keep sp 8-byte aligned
            ldr     r0, =values             @ r0 = start of vector
            mov     r1, #num_values         @ r1 = number of elements
            ldr     r2, =output             @ r2 = format string
            bl      _vprintb

            mov     r0, #0                  @ return 0 from main
            ldmfd   sp!, {r4, pc}
    
    
    bob@poland:~/www/examples$ make test_vprintb
    /usr/bin/gcc -gstabs -o test_vprintb test_vprintb.s vprintb.s
    bob@poland:~/www/examples$ ./test_vprintb
    The value is 10
    The value is 15
    The value is 20
    The value is 25
    The value is 30
    The value is 35
    The value is 40
    The value is 45
    The value is 50
    The value is 55
    The value is 60
    bob@poland:~/www/examples$
    
  3. A subroutine to display a vector of words
  4. Start to change movtest3.s as follows.

    The subroutine to print is given by

    @ this subroutine prints vectors of words
    @
    @ inputs
    @   r0 - start of vector
    @   r1 - number of elements to print (nothing is printed if <= 0)
    @   r2 - pointer to start of string used to print first element
    @   r3 - pointer to start of string used to print subsequent elements
    @
    @ no outputs
    @
    @ locals
    @   r4 - pointer to the next element
    @   r5 - number of elements still to print
    @   r6 - format string for the element about to be printed
    @   r7 - format string for every element after the first
    @
    .globl _vprintw
    .equ datum_size,4
    _vprintw:
            stmfd   sp!, {r4, r5, r6, r7, r8, lr}   @ r8 is pushed only so that sp stays 8-byte aligned for printf
            cmp     r1, #0                          @ exit if no elements
            ble     vprintw_done
            mov     r4, r0                          @ copy the parameters to locals
            mov     r5, r1
            mov     r6, r2                          @ the first element uses the first string
            mov     r7, r3
    vprintw_loop:
            ldr     r1, [r4], #datum_size           @ load next vector element to r1 and bump pointer
            mov     r0, r6                          @ address of the current string to r0
            bl      printf                          @ and print it
            mov     r6, r7                          @ every later element uses the subsequent string
            subs    r5, r5, #1                      @ decrement counter
            bne     vprintw_loop                    @ and loop if non-zero
    vprintw_done:

            ldmfd   sp!, {r4, r5, r6, r7, r8, pc}   @ restore registers from stack and return
    
    

    A simple test harness to use the subroutine is given by

    @ Test harness for _vprintw: prints a table of words on one line.
    @
    @ main returns to the C runtime instead of invoking the exit syscall
    @ directly, so that output buffered by printf is flushed even when stdout
    @ is not a terminal.
    .equ datum_size,4
    .equ num_values,11                      @ number of entries in values
    .globl main
    .section .rodata
    first:
            .asciz "Vector of words - values : %d"
    subsequent:
            .asciz ", %d"
    final:
            .asciz "\n"
    values:
            .word 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
    endvalues:
    .text
    main:
            stmfd   sp!, {r4, lr}           @ save lr; r4 is pushed only to keep sp 8-byte aligned
            ldr     r0, =values             @ r0 = start of vector
            mov     r1, #num_values         @ r1 = number of elements
            ldr     r2, =first              @ r2 = string for the first element
            ldr     r3, =subsequent         @ r3 = string for the remaining elements
            bl      _vprintw
            ldr     r0, =final              @ finish the line
            bl      printf

            mov     r0, #0                  @ return 0 from main
            ldmfd   sp!, {r4, pc}
    
    
    bob@poland:~/www/examples$ make test_vprintw
    /usr/bin/gcc -gstabs -o test_vprintw test_vprintw.s vprintw.s
    bob@poland:~/www/examples$ ./test_vprintw
    Vector of words - values : 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60
    bob@poland:~/www/examples$
    

    A more complex example - a bubble sort

    A bubble sort function

    @ this subroutine bubble sorts vectors of words into ascending order,
    @ comparing the elements as signed values
    @
    @ inputs
    @   r0 - start of vector
    @   r1 - number of elements to sort
    @
    @ no outputs
    @
    @ locals
    @   r4 - current pointer
    @   r5 - inner counter
    @   r6 - keep_going flag
    @   r7 - first element
    @   r8 - second element
    @
    @ r0 and r1 are not modified, so each pass can be restarted from them.

    .equ datum_size,4
    .globl _bubble
    .text
    _bubble:
            stmfd   sp!, {r4, r5, r6, r7, r8, lr}   @ save variables to stack
            cmp     r1, #1                          @ number of elements must be > 1
            ble     end_outer                       @ stop if nothing to do

            sub     r5, r1, #1                      @ need n-1 comparisons
            mov     r4, r0                          @ initialise current pointer
            mov     r6, #0                          @ this register set when we swap

    loop_start:
            ldr     r7, [r4], #datum_size           @ load one element; r4 now points at the next
            ldr     r8, [r4]                        @ and load the next one
            cmp     r7, r8                          @ compare them
            ble     no_swap                         @ branch if already in order (first <= second)

            mov     r6, #1                          @ set keep_going flag
            str     r8, [r4, #-datum_size]          @ smaller value goes to the first slot
            str     r7, [r4]                        @ larger value goes to the second slot
    no_swap:
            subs    r5, r5, #1                      @ decrement counter
            bne     loop_start                      @ and restart loop if more needed

    end_inner:
            cmp     r6, #0                          @ check keep_going flag
            beq     end_outer                       @ and leave if not set

            mov     r6, #0                          @ clear keep_going flag
            mov     r4, r0                          @ reset pointer
            sub     r5, r1, #1                      @ reset counter
            b       loop_start                      @ start another iteration

    end_outer:
            ldmfd   sp!, {r4, r5, r6, r7, r8, pc}   @ restore state from stack and leave subroutine
    
    

    A bubble sort test harness to use the subroutine

    @ bubble.s
    @ Test harness for _bubble: prints a vector of words, sorts it in place,
    @ then prints it again.
    @
    @ main returns to the C runtime instead of invoking the exit syscall
    @ directly, so that output buffered by printf is flushed even when stdout
    @ is not a terminal.
    .globl main

    .equ items,16
    @ alternative sizes for exercising the edge cases:
    @.equ items,0
    @.equ items,1
    .equ datum_size,4
    .section .rodata
    before:
            .asciz "Before sorting, values are : %d"
    after:
            .asciz "After sorting,  values are : %d"
    comma:
            .asciz ", %d"
    new_line:
            .asciz "\n"
    ok:
            .asciz "ok\n"
    .section .data
    values:
            .word 105, -7, 235, 61, 28, 315, 456, 63, 134, 97, 221, 53, 1000899, 145, 117, 5
    evalues:
            .word 1, 2, 3

    .text
    main:
            stmfd   sp!, {r4, lr}   @ save lr; r4 is pushed only to keep sp 8-byte aligned

            ldr     r0, =values     @ print the vector before sorting
            mov     r1, #items
            ldr     r2, =before
            ldr     r3, =comma
            bl      _vprintw
            ldr     r0, =new_line
            bl      printf
    @       ldr     r0, =ok         @ debugging aid: progress marker
    @       bl      printf

            ldr     r0, =values     @ sort it in place
            mov     r1, #items
            bl      _bubble
    @       ldr     r0, =ok         @ debugging aid: progress marker
    @       bl      printf

            ldr     r0, =values     @ print the vector after sorting
            mov     r1, #items
            ldr     r2, =after
            ldr     r3, =comma
            bl      _vprintw
            ldr     r0, =new_line
            bl      printf

            mov     r0, #0          @ return 0 from main
            ldmfd   sp!, {r4, pc}
    
    
    bob@poland:~/www/examples$ make bubble
    /usr/bin/gcc -gstabs -o bubble bubble.s bubble_sub.s vprintw.s
    bob@poland:~/www/examples$ ./bubble 
    Before sorting, values are : 105, -7, 235, 61, 28, 315, 456, 63, 134, 97, 221, 53, 1000899, 145, 117, 5
    After sorting,  values are : -7, 5, 28, 53, 61, 63, 97, 105, 117, 134, 145, 221, 235, 315, 456, 1000899
    bob@poland:~/www/examples$
    

    A program to demonstrate the use of the floating-point unit - works on the RPi

    	@ Floating-point demonstration: converts two single-precision constants
    	@ (pi and e) to double precision, multiplies them, and prints the values
    	@ with printf. Doubles are handed to printf in r2:r3 and on the stack.
    	@
    	@ Stack frame below the saved fp/lr (frame_size bytes):
    	@   [sp, #0]          outgoing printf argument: e
    	@   [sp, #8]          outgoing printf argument: pi * e
    	@   [sp, #saved_pi]   copy of pi
    	@   [sp, #saved_e]    copy of e
    	@   [sp, #saved_prod] copy of pi * e
    	@ The copies are reloaded before the later calls because d0-d7 are not
    	@ preserved across a function call.
    	.equ	saved_pi, 16
    	.equ	saved_e, 24
    	.equ	saved_prod, 32
    	.equ	frame_size, 40

    	.section	.rodata
    	.align	2
    string:
    	.asciz	"pi (%f) times e (%f) is %f\n"
    string2:
    	.asciz	"float (%f)\n"

    	.text
    	.align	2
    	.global	main
    	.type	main, %function
    main:
    	stmfd	sp!, {fp, lr}
    	sub	sp, sp, #frame_size	@ 8 + 40 bytes keeps sp 8-byte aligned

    	@ Move the constants straight from a core register into the VFP.
    	@ (Storing through r1 and r2 would write to addresses this code never
    	@ set up: on entry to main they hold whatever the C runtime passed,
    	@ normally argv and envp.)
    	ldr	r3, const_pi
    	fmsr	s14, r3			@ s14 = pi (single precision)
    	fcvtds	d5, s14			@ d5 = pi (double precision)

    	ldr	r3, const_e
    	fmsr	s15, r3			@ s15 = e (single precision)
    	fcvtds	d6, s15			@ d6 = e (double precision)

    	fmuls	s15, s14, s15		@ s15 = pi * e
    	fcvtds	d7, s15			@ d7 = pi * e (double precision)

    	fstd	d5, [sp, #saved_pi]	@ keep copies that survive the calls
    	fstd	d6, [sp, #saved_e]
    	fstd	d7, [sp, #saved_prod]

    	fstd	d6, [sp]		@ 2nd and 3rd doubles go on the stack
    	fstd	d7, [sp, #8]
    	ldr	r0, =string
    	fmrrd	r2, r3, d5		@ 1st double goes in r2:r3
    	bl	printf

    	ldr	r0, =string2
    	fldd	d0, [sp, #saved_pi]
    	fmrrd	r2, r3, d0
    	bl	printf

    	ldr	r0, =string2
    	fldd	d0, [sp, #saved_e]
    	fmrrd	r2, r3, d0
    	bl	printf

    	ldr	r0, =string2
    	fldd	d0, [sp, #saved_prod]
    	fmrrd	r2, r3, d0
    	bl	printf

    	mov	r0, #0			@ return 0 from main
    	add	sp, sp, #frame_size
    	ldmfd	sp!, {fp, pc}

    	.align	2
    const_pi:
    	.float	3.1415926
    const_e:
    	.float	2.718281
    
    
  5. A number of problems from projecteuler.net coded in ARM assembly language
  6. Problems here