skx · skx · Jun 15, 2020 · Jun 15, 2020 · Jun 15, 2020 · Jun 15, 2020
diff --git a/lexer.go b/lexer.go
@@ -0,0 +1,135 @@
+package main
+
+import "strings"
+
+// These constants are the token-types the lexer can produce.
+const (
+	EOF = "EOF"
+
+	//
+	// Each value is the source character itself, so Next can use the
+	// matched character directly as the token type.
+	// TODO: Better names.  Are there standard values?
+	//
+	LESS       = "<"
+	GREATER    = ">"
+	PLUS       = "+"
+	MINUS      = "-"
+	OUTPUT     = "."
+	INPUT      = ","
+	LOOP_OPEN  = "["
+	LOOP_CLOSE = "]"
+)
+
+// Token is a single lexed token, as returned by Lexer.Next.
+type Token struct {
+
+	// Type is one of the token-type constants (such as "<", "[", or EOF).
+	Type string
+
+	// Repeat is the length of the run of adjacent identical characters
+	// this token stands for; it is always 1 for ".", ",", "[", "]" and EOF.
+	Repeat int
+}
+
+// Lexer holds the state needed to tokenize one input program.
+type Lexer struct {
+
+	// input is the program text, after NewLexer has removed "\n", "\r" and " ".
+	input string
+
+	// position is the byte offset into input of the next character to examine.
+	position int
+
+	// known maps each recognised single-character token to its token-type.
+	known map[string]string
+}
+
+// NewLexer creates a Lexer for the specified input program.  Newlines,
+// carriage-returns and spaces are removed first; tokens come from Next.
+func NewLexer(input string) *Lexer {
+
+	// Create the lexer object; position starts at its zero value.
+	l := &Lexer{input: input}
+
+	// Strip newlines/spaces up front so they cannot interrupt a run
+	l.input = strings.ReplaceAll(l.input, "\n", "")
+	l.input = strings.ReplaceAll(l.input, "\r", "")
+	l.input = strings.ReplaceAll(l.input, " ", "")
+
+	// Populate the map of recognised characters; Next skips over
+	// any character which is absent from it.
+	l.known = make(map[string]string)
+
+	l.known["+"] = PLUS
+	l.known["-"] = MINUS
+	l.known[">"] = GREATER
+	l.known["<"] = LESS
+	l.known[","] = INPUT
+	l.known["."] = OUTPUT
+	l.known["["] = LOOP_OPEN
+	l.known["]"] = LOOP_CLOSE
+
+	return l
+}
+
+// Next returns the next token from our input stream, or an EOF token
+// once the input is exhausted.  Characters which are not known tokens
+// are skipped.  A run of adjacent identical "+", "-", "<" or ">"
+// characters is returned as one token whose Repeat is the length of
+// the run; every other token is returned with a Repeat of 1.
+func (l *Lexer) Next() *Token {
+
+	// Loop until we've exhausted our input.
+	for l.position < len(l.input) {
+
+		// Get the next character (one byte of the input)
+		char := string(l.input[l.position])
+
+		// Is this a known character/token?
+		_, ok := l.known[char]
+		if ok {
+
+			//
+			// I/O and loop tokens are never merged into a run.
+			//
+			if char == INPUT || char == OUTPUT || char == LOOP_OPEN || char == LOOP_CLOSE {
+				l.position++
+				return &Token{Type: char, Repeat: 1}
+			}
+
+			// Record where this run starts
+			begin := l.position
+
+			// Loop forward while the same character keeps
+			// appearing; the first pass always matches
+			for l.position < len(l.input) {
+
+				// Any other character, known or not,
+				// ends the run
+				if string(l.input[l.position]) != char {
+					break
+				}
+
+				// Otherwise keep advancing forward
+				l.position++
+			}
+
+			// The distance we advanced is the number of
+			// adjacent appearances of the character
+			count := l.position - begin
+			return &Token{Type: char, Repeat: count}
+		}
+
+		//
+		// Unknown character: skip it and try the next one.
+		//
+		l.position++
+	}
+
+	//
+	// If we got here then we're at/after the end of our input
+	// string.  So we just return EOF.
+	//
+	return &Token{Type: EOF, Repeat: 1}
+}
diff --git a/lexer_test.go b/lexer_test.go
@@ -0,0 +1,62 @@
+package main
+
+import (
+	"testing"
+)
+
+// TestLexer checks the type and repeat-count of each token from a small program.
+func TestLexer(t *testing.T) {
+
+	tests := []struct {
+		expectedType  string
+		expectedCount int
+	}{
+		{PLUS, 1},
+		{MINUS, 1},
+		{LESS, 5},
+		{GREATER, 5},
+		{LOOP_OPEN, 1},
+		{LOOP_CLOSE, 1},
+		{OUTPUT, 1},
+		{INPUT, 1},
+		{EOF, 1},
+	}
+
+	l := NewLexer("+-<<<<<\n>>>>>[].,")
+
+	for i, tt := range tests {
+		tok := l.Next()
+		if tok.Type != tt.expectedType {
+			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
+		}
+		if tok.Repeat != tt.expectedCount {
+			t.Fatalf("tests[%d] - count wrong, expected=%d, got=%d", i, tt.expectedCount, tok.Repeat)
+		}
+	}
+}
+
+// TestAdjacent is designed to ensure we count adjacent runs of characters
+// even when newlines or spaces separate them in the original input.
+func TestAdjacent(t *testing.T) {
+
+	tests := []struct {
+		expectedType  string
+		expectedCount int
+	}{
+		{PLUS, 5},
+		{MINUS, 5},
+		{EOF, 1},
+	}
+
+	l := NewLexer("+\n+\n+\n+\n+- - - - -")
+
+	for i, tt := range tests {
+		tok := l.Next()
+		if tok.Type != tt.expectedType {
+			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
+		}
+		if tok.Repeat != tt.expectedCount {
+			t.Fatalf("tests[%d] - count wrong, expected=%d, got=%d", i, tt.expectedCount, tok.Repeat)
+		}
+	}
+}
diff --git a/main.go b/main.go
@@ -33,55 +33,62 @@ _start:
 	//
 	opens := []int{}
 
+	//
+	// Create a lexer for the input program
+	//
+	l := NewLexer(source)
+
+	//
+	// Fetch the first token; the loop below fetches the rest
+	//
+	tok := l.Next()
+
+	//
+	// We keep track of the loop-labels here.
+	//
+	// Each time we see a new loop-open "[" we bump this
+	// by one.
+	//
 	i := 0
-	ln := len(source)
 
-	for i < ln {
+	//
+	// We'll process the complete program until
+	// we hit an end of file/input
+	//
+	for tok.Type != EOF {
 
-		switch source[i] {
-		case '>':
-			end := i
-			for source[end] == '>' {
-				end++
-			}
-			buff.WriteString(fmt.Sprintf("  add r8, %d\n", end-i))
-			i = end - 1
-		case '<':
-			end := i
-			for source[end] == '<' {
-				end++
-			}
-			buff.WriteString(fmt.Sprintf("  sub r8, %d\n", end-i))
-			i = end - 1
-		case '+':
-			end := i
-			for source[end] == '+' {
-				end++
-			}
-			buff.WriteString(fmt.Sprintf("  add byte [r8], %d\n", end-i))
-			i = end - 1
-		case '-':
-			end := i
-			for source[end] == '-' {
-				end++
-			}
-			buff.WriteString(fmt.Sprintf("  sub byte [r8], %d\n", end-i))
-			i = end - 1
-		case '.':
-			// output
+		//
+		// Output different things depending on the token-type
+		//
+		switch tok.Type {
+
+		case GREATER:
+			buff.WriteString(fmt.Sprintf("  add r8, %d\n", tok.Repeat))
+
+		case LESS:
+			buff.WriteString(fmt.Sprintf("  sub r8, %d\n", tok.Repeat))
+
+		case PLUS:
+			buff.WriteString(fmt.Sprintf("  add byte [r8], %d\n", tok.Repeat))
+
+		case MINUS:
+			buff.WriteString(fmt.Sprintf("  sub byte [r8], %d\n", tok.Repeat))
+
+		case OUTPUT:
 			buff.WriteString("  mov rax, 1\n")  // SYS_WRITE
 			buff.WriteString("  mov rdi, 1\n")  // STDOUT
 			buff.WriteString("  mov rsi, r8\n") // data-comes-here
 			buff.WriteString("  mov rdx, 1\n")  // one byte
 			buff.WriteString("  syscall\n")     // Syscall
-		case ',':
-			// input
+
+		case INPUT:
 			buff.WriteString("  mov rax, 0\n")  // SYS_READ
 			buff.WriteString("  mov rdi, 0\n")  // STDIN
 			buff.WriteString("  mov rsi, r8\n") // Dest
 			buff.WriteString("  mov rdx, 1\n")  // one byte
 			buff.WriteString("  syscall\n")     // syscall
-		case '[':
+
+		case LOOP_OPEN:
 
 			//
 			// Open of a block.
@@ -92,11 +99,14 @@ _start:
 			// NOTE: We repeat the test at the end of the
 			// loop so the label here is AFTER our condition
 			//
+			i++
 			buff.WriteString("  cmp byte [r8], 0\n")
 			buff.WriteString(fmt.Sprintf("  je close_loop_%d\n", i))
 			buff.WriteString(fmt.Sprintf("label_loop_%d:\n", i))
 			opens = append(opens, i)
-		case ']':
+
+		case LOOP_CLOSE:
+
 			// "]" can only follow an "[".
 			//
 			// Every time we see a "[" we save the ID onto a
@@ -105,6 +115,10 @@ _start:
 			//
 			// This will cope with nesting.
 			//
+			if len(opens) < 1 {
+				fmt.Printf("close before open.  bug?  bogus program?\n")
+				os.Exit(1)
+			}
 
 			//
 			// Get the last label-ID
@@ -141,9 +155,16 @@ _start:
 
 			buff.WriteString(fmt.Sprintf("  jne label_loop_%d\n", last))
 			buff.WriteString(fmt.Sprintf("close_loop_%d:\n", last))
+
+		default:
+			fmt.Printf("token not handled: %v\n", tok)
+			os.Exit(1)
 		}
 
-		i++
+		//
+		// Keep processing
+		//
+		tok = l.Next()
 	}
 
 	// terminate