Skip to content

Commit

Permalink
ch02 refactored common parts, deduped code
Browse files Browse the repository at this point in the history
  • Loading branch information
spamegg1 committed Apr 5, 2024
1 parent 0c11705 commit f32bccc
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 80 deletions.
17 changes: 12 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ I noticed many things have changed.

### Unused lines of code in the book (probably errors)

There are lines of code in the zip file provided on [the book's website](https://media.pragprog.com/titles/rwscala/code/rwscala-code.zip).
There are lines of code in the zip file provided on [the book's website](https://media.pragprog.com/titles/rwscala/code/rwscala-code.zip). Some of these are also printed in the book!

For example, in Chapter 4's `nativeFork` there is

Expand All @@ -48,24 +48,31 @@ which is never used. There are lots of other examples. There are also many unuse

There is also a lot of code duplication, I suppose, to make each individual file "runnable" by itself. I removed redundant code by adding package declarations, then importing the duplicated code from other files instead.

For example, Chapter 4's `badExec.scala` duplicates a lot of code from `nativeFork.scala`. I solved it by adding package declarations:
For example, Chapter 4's `badExec.scala` duplicates a lot of code from `nativeFork.scala`. I solved it by separating duplicate code into a file, and adding package declarations:

```scala
// this is common.scala
package ch04.common
```

```scala
// this is nativeFork.scala
package ch04.nativeFork

import ch04.common
// ...
```

```scala
// this is badExec.scala
package ch04.badExec

import ch04.nativeFork.runCommand
import ch04.common
// ...
// then use runCommand in @main
// then use common.runCommand in @main
```

There is a lot of this duplication in later chapters. I fixed them.
There is a lot of this duplication in later chapters. I'll fix them.

### `CSize / USize` instead of `Int`

Expand Down
69 changes: 69 additions & 0 deletions src/main/scala/ch02/00common.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package ch02.common

import scalanative.unsigned.UnsignedRichInt // .toUSize
import scalanative.unsafe.{Ptr, sizeof, CString, CStruct4, CFuncPtr2, CQuote, extern}
import scalanative.libc.{stdio, stdlib, string}
import scala.scalanative.unsigned.{ULong, USize}
import scalanative.unsafe.CSize

// First, data definitions.
type NGramData = CStruct4[CString, Int, Int, Int] // 8 + 4 + 4 + 4 = 20 bytes

// We need an array of NGramData's.
final case class WrappedArray[T]( // here T = NGramData
var data: Ptr[T], // pointer to the start of an array of multiple NGramData structs.
var used: Int, // how many structs are currently used
var capacity: Int // how many structs it can hold
)

// allocate an array of uninitialized NGramData structs. size = how many NGrams we want.
def makeWrappedArray(size: Int): WrappedArray[NGramData] =
val data: Ptr[NGramData] = stdlib
.malloc(size.toUSize * sizeof[NGramData]) // 0.5: use .toUSize
.asInstanceOf[Ptr[NGramData]]
WrappedArray[NGramData](data, 0, size) // when we free this, we will free data only.

// Grow an array's capacity by given size. realloc copies existing data if necessary.
def growWrappedArray(array: WrappedArray[NGramData], size: Int): Unit =
val newCapacity: Int = array.capacity + size
val newSize: USize = newCapacity.toUSize * sizeof[NGramData] // 0.5: use .toUSize
val newData: Ptr[Byte] = stdlib.realloc(array.data.asInstanceOf[Ptr[Byte]], newSize)
array.data = newData.asInstanceOf[Ptr[NGramData]] // update in-place
array.capacity = newCapacity // update in-place

// Here, array = data in a WrappedArray. Long sequence of NGramData's.
def freeArray(array: Ptr[NGramData], size: Int): Unit =
// First, free all the strings that structs are pointing at.
for i <- 0 until size do
val item: Ptr[NGramData] = array + i // 0.5
stdlib.free(item._1) // string is the first field of the struct.
stdlib.free(array.asInstanceOf[Ptr[Byte]]) // now free the structs themselves.

// Two comparison functions. This is a bit inefficient.
val byCountNaive = CFuncPtr2.fromScalaFunction[Ptr[Byte], Ptr[Byte], Int]:
(p1: Ptr[Byte], p2: Ptr[Byte]) =>
val nGramPtr1 = p1.asInstanceOf[Ptr[NGramData]]
val nGramPtr2 = p2.asInstanceOf[Ptr[NGramData]]
val count1 = nGramPtr1._3 // count is the third field, book is wrong, it has ._2
val count2 = nGramPtr2._3
if count1 > count2 then -1 // these are a bit inefficient
else if count1 == count2 then 0
else 1

// This one is a bit more efficient, returns neg, 0, pos.
val byCount = CFuncPtr2.fromScalaFunction[Ptr[Byte], Ptr[Byte], Int]:
(p1: Ptr[Byte], p2: Ptr[Byte]) =>
val nGramPtr1 = p1.asInstanceOf[Ptr[NGramData]]
val nGramPtr2 = p2.asInstanceOf[Ptr[NGramData]]
val count1 = nGramPtr1._3
val count2 = nGramPtr2._3
count2 - count1

@extern
object qsort:
def qsort(
data: Ptr[Byte],
num: Int,
size: Long,
comparator: CFuncPtr2[Ptr[Byte], Ptr[Byte], Int]
): Unit = extern
72 changes: 5 additions & 67 deletions src/main/scala/ch02/01sortByCount.scala
Original file line number Diff line number Diff line change
@@ -1,43 +1,10 @@
package ch02.sortByCount

import scalanative.unsigned.UnsignedRichInt // .toUSize
import scalanative.unsafe.{Ptr, sizeof, CString, CStruct4, CFuncPtr2, CQuote, extern}
import scalanative.unsafe.{Ptr, CQuote, CSize, sizeof}
import scalanative.libc.{stdio, stdlib, string}
import scala.scalanative.unsigned.{ULong, USize}
import scalanative.unsafe.CSize

// First, data definitions.
type NGramData = CStruct4[CString, Int, Int, Int] // 8 + 4 + 4 + 4 = 20 bytes

// We need an array of NGramData's.
final case class WrappedArray[T]( // here T = NGramData
var data: Ptr[T], // pointer to the start of an array of multiple NGramData structs.
var used: Int, // how many structs are currently used
var capacity: Int // how many structs it can hold
)

// allocate an array of uninitialized NGramData structs. size = how many NGrams we want.
def makeWrappedArray(size: Int): WrappedArray[NGramData] =
val data: Ptr[NGramData] = stdlib
.malloc(size.toUSize * sizeof[NGramData]) // 0.5: use .toUSize
.asInstanceOf[Ptr[NGramData]]
WrappedArray[NGramData](data, 0, size) // when we free this, we will free data only.

// Grow an array's capacity by given size. realloc copies existing data if necessary.
def growWrappedArray(array: WrappedArray[NGramData], size: Int): Unit =
val newCapacity: Int = array.capacity + size
val newSize: USize = newCapacity.toUSize * sizeof[NGramData] // 0.5: use .toUSize
val newData: Ptr[Byte] = stdlib.realloc(array.data.asInstanceOf[Ptr[Byte]], newSize)
array.data = newData.asInstanceOf[Ptr[NGramData]] // update in-place
array.capacity = newCapacity // update in-place

// Here, array = data in a WrappedArray. Long sequence of NGramData's.
def freeArray(array: Ptr[NGramData], size: Int): Unit =
// First, free all the strings that structs are pointing at.
for i <- 0 until size do
val item: Ptr[NGramData] = array + i // 0.5
stdlib.free(item._1) // string is the first field of the struct.
stdlib.free(array.asInstanceOf[Ptr[Byte]]) // now free the structs themselves.
import ch02.common
import common.{qsort, NGramData, byCount}

// this temporary space is used to read the string in each line of the file.
val tempWord: Ptr[Byte] = stdlib.malloc(1024) // 0.5
Expand Down Expand Up @@ -67,26 +34,6 @@ def parseLine(lineBuffer: Ptr[Byte], data: Ptr[NGramData]): Unit =
string.strncpy(newString, tempWord, wordLength + 1.toUSize) // 0.5
data._1 = newString // update in-place, initialization of struct complete.

// Two comparison functions. This is a bit inefficient.
val byCountNaive = CFuncPtr2.fromScalaFunction[Ptr[Byte], Ptr[Byte], Int]:
(p1: Ptr[Byte], p2: Ptr[Byte]) =>
val nGramPtr1 = p1.asInstanceOf[Ptr[NGramData]]
val nGramPtr2 = p2.asInstanceOf[Ptr[NGramData]]
val count1 = nGramPtr1._3 // count is the third field, book is wrong, it has ._2
val count2 = nGramPtr2._3
if count1 > count2 then -1 // these are a bit inefficient
else if count1 == count2 then 0
else 1

// This one is a bit more efficient, returns neg, 0, pos.
val byCount = CFuncPtr2.fromScalaFunction[Ptr[Byte], Ptr[Byte], Int]:
(p1: Ptr[Byte], p2: Ptr[Byte]) =>
val nGramPtr1 = p1.asInstanceOf[Ptr[NGramData]]
val nGramPtr2 = p2.asInstanceOf[Ptr[NGramData]]
val count1 = nGramPtr1._3
val count2 = nGramPtr2._3
count2 - count1

// MAIN
// reached EOF after 86618505 lines in 1358520 ms
// sorting done in 2764701 ms
Expand Down Expand Up @@ -115,12 +62,12 @@ val byCount = CFuncPtr2.fromScalaFunction[Ptr[Byte], Ptr[Byte], Int]:
def sortByCount(args: String*): Unit =
val blockSize = 1048576 // 2^20 NGramData items = 2^20 * 20 bytes = 20 MB
val lineBuffer = stdlib.malloc(1024) // 0.5
var array = makeWrappedArray(blockSize)
var array = common.makeWrappedArray(blockSize)
val readStart = System.currentTimeMillis()

// pipe file into stdin, then read line by line (fgets respects newlines)
while stdio.fgets(lineBuffer, 1024, stdio.stdin) != null do // reads <= 1024 - 1 chars
if array.used >= array.capacity then growWrappedArray(array, blockSize)
if array.used >= array.capacity then common.growWrappedArray(array, blockSize)
parseLine(lineBuffer, array.data + array.used) // parse CURRENT line
array.used += 1

Expand Down Expand Up @@ -160,12 +107,3 @@ def sortByCount(args: String*): Unit =

// def parseLineNaive(fd: Ptr[stdio.FILE], array: Ptr[NGramData]): Unit =
// ??? // TODO

@extern
object qsort:
def qsort(
data: Ptr[Byte],
num: Int,
size: Long,
comparator: CFuncPtr2[Ptr[Byte], Ptr[Byte], Int]
): Unit = extern
12 changes: 5 additions & 7 deletions src/main/scala/ch02/02aggregateAndCount.scala
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
package ch02.aggregateAndCount

import scalanative.unsigned.UnsignedRichInt // .toUSize
import scalanative.unsafe.{Ptr, CSize, CQuote, CString, sizeof, stackalloc}
import scalanative.libc.{stdio, stdlib, string}, stdio.FILE
import scalanative.unsafe.{Ptr, stackalloc, sizeof, CString, CStruct4, CFuncPtr2, CQuote}
import scalanative.unsafe.extern

import ch02.sortByCount.{NGramData, WrappedArray, makeWrappedArray, growWrappedArray}
import ch02.sortByCount.{byCount, qsort}
import ch02.common
import common.{qsort, NGramData, byCount, WrappedArray}

val lineBuffer = stdlib.malloc(1024) // 0.5
val tempWordBuffer = stdlib.malloc(1024)
val blockSize = 1048576 // ~ 1MB - too big? (same)

@main
def aggregateAndCount(args: String*): Unit =
val array = makeWrappedArray(blockSize)
val array = common.makeWrappedArray(blockSize)
val readStart = System.currentTimeMillis()
val linesRead = readAllLines(stdio.stdin, array) // NEW, different
val readElapsed = System.currentTimeMillis() - readStart
Expand Down Expand Up @@ -44,7 +42,7 @@ def readAllLines(fd: Ptr[FILE], array: WrappedArray[NGramData]): Long =
var linesRead = 0L

while stdio.fgets(lineBuffer, 1024, fd) != null do
if array.used >= array.capacity - 1 then growWrappedArray(array, blockSize)
if array.used >= array.capacity - 1 then common.growWrappedArray(array, blockSize)
parseAndCompare(lineBuffer, array) // this is the main difference!
linesRead += 1

Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/ch04/03nativePipe.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package ch04.nativePipe

import scalanative.unsafe.{stackalloc}
import scalanative.unsafe.stackalloc
import scalanative.posix.unistd // getpid
import collection.mutable.ArrayBuffer
import ch04.common
Expand Down

0 comments on commit f32bccc

Please sign in to comment.