chastext 64-bit

I have converted my chastext program to 64 bit Assembly for Linux. Next to chastehex, this is the program I am most proud of because it can find and replace exact strings of text. It isn’t quite the same as the Linux “sed” tool, but it is faster, smaller, and I wrote it myself and can do whatever I want with it.

So of course what I did was write a shell script to show what it is capable of!

main.asm

;Linux 64-bit Assembly Source for chastext
;a basic text search and replace program
format ELF64 executable
entry main

include 'chastelib64.asm'

main:

;Program entry: parse the command line, open the input file and remember
;the optional search and replacement strings.
;
;At process entry the stack holds argc followed by the argv pointers,
;so each pop below consumes the next item.
;
;  chastext file                 -> argc == 2
;  chastext file search          -> argc == 3
;  chastext file search replace  -> argc == 4
;
;Execution falls through the end of this block into the textdump loop.

        pop     rax
        mov     [argc],rax              ;save the argument count for later

        cmp     qword [argc],1
        ja      help_skip               ;a file name was given, skip the help message

help:
        mov     rax,help_message
        call    putstring

        ;No file has been opened at this point, so exit directly instead of
        ;going through main_end, which closes whatever number is in filedesc.
        mov     rdi,0                   ;status 0: printing the help is not an error
        mov     rax,0x3C                ;invoke SYS_EXIT (kernel opcode 60 on 64 bit systems)
        syscall

help_skip:

        pop     rax                     ;argv[0], the program name, is not needed

get_filename:
        pop     rax                     ;argv[1], the name of the file we will open
        mov     [filename],rax

arg_open_file:

        ;Linux system call to open a file
        mov     rsi,0                   ;flags: open file in read only mode
        mov     rdi,rax                 ;pointer to the file name
        mov     rax,2                   ;invoke SYS_OPEN (kernel opcode 2 on 64 bit systems)
        syscall

        cmp     rax,0
        jns     file_open_no_errors     ;a non-negative result is the new file descriptor

        ;A negative result means the open failed: report it and stop.
        mov     rax,open_error_message
        call    putstr_and_line

        ;There is no descriptor to close, so exit here with a non-zero status
        ;that lets shell scripts detect the failure.
        mov     rdi,1                   ;status 1: could not open the input file
        mov     rax,0x3C                ;invoke SYS_EXIT
        syscall

file_open_no_errors:

        mov     [filedesc],rax          ;save the file descriptor number for later use

        ;Before dumping the file, pick up the optional arguments that change the output.

        cmp     qword [argc],3
        jb      search_skip

        pop     rax                     ;argv[2], the string we are searching for
        mov     [string_search],rax

search_skip:

        cmp     qword [argc],4
        jb      replace_skip

        pop     rax                     ;argv[3], the string that replaces every match
        mov     [string_replace],rax

replace_skip:

;now we begin displaying the file but also searching for the search string if it exists. We will check for these based on the number of arguments like we did earlier

textdump:

;Head of the main output loop.
;With exactly 2 arguments (program name plus input file) the file is copied
;to standard output one byte at a time by the cat loop below.
;With more than 2 arguments a search string was given, so control goes to
;search_mode instead. search_mode jumps back here after every step.

        cmp     qword [argc],2          ;2 = only a file name was given
        ja      search_mode

;This loop is the same as the Linux 'cat' command
;or the DOS 'type' command for a single file.
;It reads one byte and echoes it to standard output until end of file.

cat:

        mov     rdx,1                   ;number of bytes to read
        mov     rsi,byte_array          ;address to store the bytes
        mov     rdi,[filedesc]          ;descriptor of the opened file
        mov     rax,0                   ;invoke SYS_READ (kernel opcode 0 on 64 bit Intel)
        syscall

        mov     [bytes_read],rax

        ;read returns 0 at end of file and a negative errno value on failure
        ;(for example when the "file" is a directory). Both must end the loop.
        ;Testing only for zero would treat an error as data and loop forever.
        cmp     rax,0
        jle     main_end

file_success:

        ;Echo the byte that was just read.
        ;rdx (1) and rsi (byte_array) still hold the values set above because
        ;the syscall instruction only overwrites rax, rcx and r11.
        mov     rdi,1                   ;write to the STDOUT file
        mov     rax,1                   ;invoke SYS_WRITE (kernel opcode 1 on 64 bit systems)
        syscall

        jmp     cat

search_mode:

;One step of search mode. Every step:
;  1. seeks to file_address,
;  2. reads as many bytes as the search string is long,
;  3. compares them with the search string,
;  4. on a match prints the replacement (or the quoted match) and skips past it,
;     otherwise prints one byte and advances by one.
;The step ends with a jump back to textdump, which sends control here again.
;A short read means the end of the file was reached and ends the loop.

;Longest search string that fits in byte_array together with its zero terminator.
;byte_array is declared as 0xA4 bytes; keep this value one below that size.
search_max = 0xA3

        ;seek to file_address, which starts at zero and grows as output is produced
        mov     rdx,0                   ;whence argument (SEEK_SET)
        mov     rsi,[file_address]      ;move the file cursor to this address
        mov     rdi,[filedesc]          ;descriptor of the opened file
        mov     rax,8                   ;invoke SYS_LSEEK (kernel opcode 8 on 64 bit Intel)
        syscall

        ;obtain the length of the search string using my strlen function
        mov     rax,[string_search]
        call    strlen

        ;An empty search string would "match" zero bytes at every position and
        ;never advance, so treat it as no search at all and just copy the file.
        cmp     rax,0
        jz      cat

        ;A search string longer than the buffer would make the read below
        ;write past the end of byte_array.
        cmp     rax,search_max
        ja      search_too_long

        ;read exactly as many bytes as the search string is long
        mov     rdx,rax                 ;number of bytes to read
        mov     rsi,byte_array          ;address to store the bytes
        mov     rdi,[filedesc]          ;descriptor of the opened file
        mov     rax,0                   ;invoke SYS_READ (kernel opcode 0 on 64 bit Intel)
        syscall

        ;A negative result is an error code, not a byte count. Stop before it is
        ;used as an offset into the buffer or as a length for the final write.
        cmp     rax,0
        jl      main_end

        mov     [bytes_read],rax        ;store how many bytes that last read returned

        mov     rbx,byte_array
        add     rbx,rax                 ;rbx = first byte after the data just read
        mov     byte [rbx],0            ;terminate the data so it can be used as a string

        ;rdx still holds the requested count (syscall preserves it).
        ;Fewer bytes than requested means we are at the end of the file.
        cmp     rax,rdx
        jnz     textdump_end

        ;compare the bytes from the file with the search string
        mov     rsi,[string_search]
        mov     rdi,byte_array
        call    strcmp

        cmp     rax,0                   ;zero means the strings are the same
        jnz     not_match

        ;A match was found. Whatever we print for it, the next step must read
        ;from beyond the matched bytes.
        mov     rax,[bytes_read]
        add     [file_address],rax

        cmp     qword [argc],4          ;fewer than 4 args means no replacement string exists
        jb      print_quotes

        ;print the replacement string instead of the original
        mov     rax,[string_replace]
        call    putstring

        jmp     textdump                ;restart the main loop

print_quotes:

        ;no replacement was given, so show the match with quotes around it
        mov     al,'"'
        call    putchar

        mov     rax,byte_array
        call    putstring

        mov     al,'"'
        call    putchar

        jmp     textdump                ;restart the main loop

not_match:

        ;No match at this position: copy a single byte to standard output.
        ;A direct system call is used instead of putchar so that zero bytes
        ;in binary files are written too.
        mov     rdx,1                   ;number of bytes to write == 1
        mov     rsi,byte_array          ;first byte read at this position
        mov     rdi,1                   ;write to the STDOUT file
        mov     rax,1                   ;invoke SYS_WRITE (kernel opcode 1 on 64 bit systems)
        syscall

        add     qword [file_address],1  ;so we don't read this same position again

        jmp     textdump

search_too_long:

        ;the search string does not fit in byte_array: report it and stop
        mov     rax,search_too_long_message
        call    putstr_and_line
        jmp     main_end

;only reachable as data, the code above always jumps away before it
search_too_long_message db 'search string is too long',0

textdump_end:

        ;print the remaining bytes, if any, left after the main loop ended
        mov     rdx,[bytes_read]        ;number of bytes to write == last read call result
        mov     rsi,byte_array          ;pointer/address of string to write
        mov     rdi,1                   ;write to the STDOUT file
        mov     rax,1                   ;invoke SYS_WRITE (kernel opcode 1 on 64 bit systems)
        syscall

main_end:

;Common exit path: release the input file, then terminate the process
;with status 0.

        ;close(filedesc)
        mov     eax,3                   ;SYS_CLOSE (a 32-bit write clears the upper half of rax)
        mov     rdi,[filedesc]          ;descriptor to close
        syscall

        ;exit(0)
        xor     edi,edi                 ;status 0 - 'No Errors'
        mov     eax,0x3C                ;SYS_EXIT (60 decimal on 64 bit systems)
        syscall

;the strlen and strcmp are named after the equivalent C functions
;but are written from scratch by me based on their expected behavior

;strlen - length of a zero terminated string
;  in:  rax = address of the string
;  out: rax = number of bytes before the terminating zero
;  rbx is saved and restored; the flags reflect the final subtraction
;  (zero flag set when the string is empty)

strlen:

        push    rbx
        lea     rbx,[rax-1]             ;start one byte early because the loop steps first

strlen_scan:
        inc     rbx
        cmp     byte [rbx],0
        jne     strlen_scan             ;keep going until rbx points at the terminator

        sub     rbx,rax                 ;terminator address - start address = length
        mov     rax,rbx
        pop     rbx

        ret

;strcmp - compare two zero terminated strings
;  in:  rdi = address of the first string
;       rsi = address of the second string
;  out: rax = 0 when the strings are identical
;       otherwise the low byte of rax is (byte from rdi) - (byte from rsi)
;       at the first position where they differ, and the upper bits are zero,
;       so the result is non-zero whenever the strings differ
;  rbx, rsi and rdi are saved and restored
;
;How it works: one byte of each string is loaded into al and bl.
;If they differ the loop stops. If they are equal and that byte is zero,
;both strings ended together and the loop stops as well. Otherwise the
;next pair of bytes is examined.
;The subtraction at the end produces the return value and also sets the
;flags, so the caller may use "jz" or "jnz" right after the call.

strcmp:

        push    rbx
        push    rsi
        push    rdi

        xor     eax,eax                 ;clear all of rax so only al carries the result

strcmp_next:
        mov     al,[rdi]
        mov     bl,[rsi]
        inc     rdi                     ;advance now, the loaded bytes are kept in al and bl
        inc     rsi
        cmp     al,bl
        jne     strcmp_done             ;first difference found
        test    al,al
        jnz     strcmp_next             ;equal and not the terminator: keep comparing

strcmp_done:
        sub     al,bl                   ;0 when the last pair was equal

        pop     rdi
        pop     rsi
        pop     rbx

        ret

;zero terminated usage text, printed by main when no file name is given
help_message db 'chastext by Chastity White Rose',0Ah,0Ah
db '"cat" a file:',0Ah,0Ah,9,'chastext file',0Ah,0Ah
db 'search for a string:',0Ah,0Ah,9,'chastext file search',0Ah,0Ah
db 'replace string:',0Ah,0Ah,9,'chastext file search replace',0Ah,0Ah
db 'Find or replace any string!',0Ah,0

;printed with putstr_and_line when the open system call returns a negative value
open_error_message db 'error while opening file',0

;offset that search mode seeks to before each read
file_address dq 0 ;file address defaults to zero AKA beginning of file

;variables for managing arguments and files
;(rq reserves one quadword each without giving an initial value)
argc rq 1 ; argument count popped from the stack at program entry
filename rq 1 ; name of the file to be opened
filedesc rq 1 ; file descriptor
bytes_read rq 1 ; result of the most recent read system call

string_search rq 1 ; place to hold the search string pointer
string_replace rq 1 ; place to hold the replacement string pointer

;where we will store data from the file
;0xA4 (164) bytes, all zero at start. Search mode reads as many bytes as the
;search string is long into this buffer and writes a zero byte after them.
byte_array db 0xA4 dup 0

chastelib64.asm

; chastelib assembly header file for 64 bit Linux
; This file is where I keep the source of my most important Assembly functions
; These are my string and integer output and conversion routines.

; To simplify documentation. The Accumulator/Arithmetic register
; (ax,eax,rax) depending on bit size shall be referred to as register A
; for the description of these core functions because the A register
; is treated special both by the Intel company and my code;

; putstring; Prints a zero terminated string from the address pointed to by the A register.
; intstr;    Converts the number in A into a zero terminated string and points A to that address
; putint;    Prints the integer in A by calling intstr and then putstring.
; strint;    Converts the zero terminated string into an integer and sets A to that value
   
; Now, the source of the functions begins, with comments included for parts that I felt needed explanation.

;putstring - print a zero terminated string to standard output
;  in:  rax = address of the string
;  out: nothing; every general purpose register is restored before returning
;
;The write system call needs its arguments in rdi, rsi and rdx, and the
;syscall instruction itself overwrites rcx and r11. All of them are saved
;here so callers can print from anywhere without losing register contents.

putstring:

        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    rsi
        push    rdi
        push    r11

        mov     rbx,rax                 ;copy rax to rbx to be used as index to the string

putstring_strlen_start: ; this loop finds the length of the string as part of the putstring function

        cmp     [rbx],byte 0            ;compare byte at address rbx with 0
        jz      putstring_strlen_end    ;the terminator marks the end, so the length is known
        inc     rbx
        jmp     putstring_strlen_start

putstring_strlen_end:
        sub     rbx,rax                 ;subtract start pointer from current pointer to get length of string

;Write string using Linux Write system call.
;Reference for 64 bit x86 syscalls is below.
;https://www.chromium.org/chromium-os/developer-library/reference/linux-constants/syscalls/#x86_64-64-bit

        mov     rdx,rbx                 ;number of bytes to write
        mov     rsi,rax                 ;pointer/address of string to write
        mov     rdi,1                   ;write to the STDOUT file
        mov     rax,1                   ;write (kernel opcode 1 on 64 bit systems)
        syscall                         ;system call for 64-bit Linux kernel

        pop     r11
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax

        ret ; this is the end of the putstring function return to calling location

; This is the location in memory where digits are written to by the intstr function
; The string of bytes and settings such as the radix and width are global variables defined below.

int_string db 64 dup '?' ;reserve bytes for characters string for 64-bit binary integer

int_string_end db 0 ;zero byte terminator for the integer string

radix dq 2 ;radix or base for integer output. 2=binary, 8=octal, 10=decimal, 16=hexadecimal
int_width dq 8 ;default width of integers. Extra zeros prefixed if more than 1

;intstr - convert the integer in rax into a zero terminated string
;  in:  rax = unsigned value to convert
;       [radix] = number base, 2 to 36 (0 is not checked and would fault in div)
;       [int_width] = minimum number of digits, padded on the left with '0'
;  out: rax = address of the first character inside int_string,
;       ready to be passed to putstring
;  rbx, rcx and rdx are overwritten
;
;Digits are produced from the lowest to the highest, so the string is built
;backwards from the byte just before the zero terminator. Neither the digit
;loop nor the padding loop ever steps below the first byte of int_string,
;so the result is limited to the 64 characters the buffer provides even when
;int_width is larger than that.

intstr:

        mov     rbx,int_string_end-1    ;address of the lowest digit (just before the zero terminator)
        mov     rcx,1                   ;number of characters written so far, counting this one

digits_start:

        mov     rdx,0                   ;div divides rdx:rax, so the upper half must be zero
        div     qword [radix]           ;rax = quotient, rdx = remainder = next digit value
        cmp     rdx,10
        jb      decimal_digit

hexadecimal_digit: ;digit values 10 and above become letters starting at 'A'
        add     rdx,'A'-10
        jmp     save_digit

decimal_digit: ;we go here if it is only a digit 0 to 9
        add     rdx,'0'

save_digit:

        mov     [rbx],dl
        cmp     rax,0
        jz      intstr_end              ;nothing left to convert
        cmp     rbx,int_string
        jbe     intstr_end              ;buffer is full, stop instead of writing before it
        dec     rbx
        inc     rcx
        jmp     digits_start

intstr_end:

prefix_zeros:
        cmp     rcx,[int_width]
        jnb     end_zeros               ;requested width reached
        cmp     rbx,int_string
        jbe     end_zeros               ;buffer is full, a wider width cannot be honoured
        dec     rbx
        mov     [rbx],byte '0'
        inc     rcx
        jmp     prefix_zeros
end_zeros:

        mov     rax,rbx                 ;point rax to this string for putstring

        ret

;putint - print the integer in rax using the current radix and int_width
;  in:  rax = value to print
;  out: nothing; rax, rbx, rcx and rdx come back unchanged
;
;This routine does no conversion of its own. It keeps copies of the four
;registers in a small stack frame, lets intstr build the string and hands
;the result to putstring.

putint:

        lea     rsp,[rsp-32]            ;room for four quadwords; lea leaves the flags alone
        mov     [rsp+24],rax
        mov     [rsp+16],rbx
        mov     [rsp+8],rcx
        mov     [rsp],rdx

        call    intstr                  ;rax = address of the digit string
        call    putstring

        mov     rdx,[rsp]
        mov     rcx,[rsp+8]
        mov     rbx,[rsp+16]
        mov     rax,[rsp+24]
        lea     rsp,[rsp+32]

        ret

;strint - convert a zero terminated string into an integer
;  in:  rax = address of the string
;       [radix] = number base the digits are written in
;  out: rax = value of the digits processed so far
;       [strint_error] = 0 when the whole string was converted,
;                        1 when conversion stopped early
;  rbx, rcx and rdx are overwritten; on return rbx points just past the
;  last character that was examined
;
;Characters '0' to '9' count as 0 to 9, and letters of either case count as
;10 to 35 so that bases up to 36 can be read. Conversion stops with an error
;at the first character that is not a digit or letter, at the first digit that
;is too large for the radix (G to Z cannot be used in hexadecimal, for example),
;or when the value no longer fits in 64 bits.

strint_error db 0 ;declare a byte variable that keeps track of errors

strint:

        mov     rbx,rax                 ;keep the string address in rbx because rax becomes the result
        mov     rax,0
        mov     byte [strint_error],0   ;no errors at the start of this function

read_strint:
        movzx   ecx,byte [rbx]          ;next character, zero extended to all of rcx
        inc     rbx
        cmp     cl,0
        jz      strint_end              ;zero byte: end of string reached without errors

        ;if char is below '0' or above '9', it is not a decimal digit
        cmp     cl,'0'
        jb      not_digit
        cmp     cl,'9'
        ja      not_digit

is_digit:
        sub     cl,'0'
        jmp     process_char

not_digit:
        ;it isn't a digit, but it could be a letter which is a digit in a higher base

        ;if char is below 'A' or above 'Z', it is not a capital letter
        cmp     cl,'A'
        jb      not_upper
        cmp     cl,'Z'
        ja      not_upper

is_upper:
        sub     cl,'A'
        add     cl,10
        jmp     process_char

not_upper:

        ;if char is below 'a' or above 'z', it is not a lowercase letter
        cmp     cl,'a'
        jb      not_lower
        cmp     cl,'z'
        ja      not_lower

is_lower:
        sub     cl,'a'
        add     cl,10
        jmp     process_char

not_lower:

        ;neither digit nor letter: the string is invalid
        jmp     strint_end_error

process_char:

        cmp     rcx,[radix]             ;a digit value must be smaller than the radix
        jnb     strint_end_error

        mul     qword [radix]           ;rdx:rax = rax * radix, carry is set when rdx is not zero
        jc      strint_end_error        ;the product does not fit in 64 bits
        add     rax,rcx
        jc      strint_end_error        ;adding the digit overflowed

        jmp     read_strint             ;continue with the next character

strint_end_error: ;we jump here if conversion had to stop early
        inc     byte [strint_error]

strint_end: ;reached directly on success and by falling through after an error

        ret

;Small helpers that print one fixed character.
;They save typing when printing lots of strings and integers.

space db ' ',0 ;a string containing only a space

;putspace - print a single space; rax is saved and restored around the call
putspace:
        push    rax
        lea     rax,[space]             ;address of the one character string
        call    putstring
        pop     rax
        ret

line db 0Ah,0 ;a string containing only a newline

;putline - print a line feed
;rax is pushed, loaded with the address of the line string for putstring,
;and popped again before returning, so the caller's rax is not disturbed.
;Other registers are left as putstring returns them.

putline:
        push    rax
        lea     rax,[line]
        call    putstring
        pop     rax
        ret

;putchar - print the single character held in al
;The character is stored in the first byte of the two byte buffer below;
;the second byte stays zero and terminates the string for putstring.
;rax is saved and restored.

char: db 0,0

putchar:
        push    rax
        mov     byte [char],al
        lea     rax,[char]
        call    putstring
        pop     rax
        ret

;putint_and_space - print the integer in rax followed by a space
;Combining the two steps saves a call at every place that needs both.

putint_and_space:
        call    putint
        jmp     putspace                ;tail call: putspace returns straight to our caller

;putint_and_line - print the integer in rax followed by a line feed
;Combining the two steps saves a call at every place that needs both.

putint_and_line:
        call    putint
        jmp     putline                 ;tail call: putline returns straight to our caller

;putstr_and_line - print the string at rax followed by a line feed
;Combining the two steps saves a call at every place that needs both,
;and strings no longer have to carry their own newline.

putstr_and_line:
        call    putstring
        jmp     putline                 ;tail call: putline returns straight to our caller

Comments

Please leave me any comments or questions you have! I will update posts if necessary based on user feedback!