Initial commit, adding work to date.

FossilOrigin-Name: e866fb59b96496de50c85d483ec89bdb8f48fb2d24c43b8bfecdc0a968962114
2025-03-15 20:26:58 +00:00 · 2025-03-15 20:26:58 +00:00 · 9ee085d86b
commit 9ee085d86b
parent ef8d5a489e
12 changed files with 2486 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
+**TBD**
--- a/experiments/imdb/imdbimport.nim
+++ b/experiments/imdb/imdbimport.nim
@ -0,0 +1,145 @@
+# vim: set et sta sw=4 ts=4 :
+#
+# Fetches movie and actor data from IMDB and converts it
+# to CSV, then imports into a Kuzu graph database.
+#
+# Only cares about actors in movies!  Things like writers,
+# directors, or TV shows are intentionally omitted.
+#
+# Compile:
+#   % nim c -d:release imdbimport.nim
+#
+# Sourced from: https://datasets.imdbws.com/
+# See: https://developer.imdb.com/non-commercial-datasets/
+
+import
+    std/os,
+    std/sequtils,
+    std/strformat,
+    std/strutils,
+    zip/gzipfiles,
+    kuzu
+
+const DB     = "imdb"
+const SOURCE = "https://datasets.imdbws.com"
+const FILES  = @[ "name.basics", "title.basics", "title.principals" ]
+
+#
+# Prep everything!
+#
+for file in FILES:
+    # Convert one IMDB dataset (a gzipped TSV) into a trimmed-down CSV for
+    # the COPY statements further down.  A dataset is skipped when its CSV
+    # is already present, and only downloaded when the .tsv.gz is missing.
+    var c     = 0
+    let tsvgz = &"{file}.tsv.gz"
+    let csv   = &"{file}.csv"
+
+    if csv.fileExists:
+        echo &"Skipping {file}, csv already exists."
+        continue
+
+    if not tsvgz.fileExists:
+        echo &"Downloading file: {file}..."
+        # Stop here if the download fails, instead of going on to read
+        # an archive that isn't there.
+        if execShellCmd( &"wget {SOURCE}/{tsvgz}" ) != 0:
+            quit( &"Unable to download {SOURCE}/{tsvgz}", QuitFailure )
+
+    let tsv_stream = newGzFileStream( tsvgz )
+    let csv_file = open( csv, fmWrite )
+
+    case file
+        of "name.basics":
+            csv_file.write( "pid,name,birthYear,deathYear\n" )
+        of "title.basics":
+            csv_file.write( "mid,title,year,durationMins\n" )
+        of "title.principals":
+            csv_file.write( "pid,mid\n" )
+
+    var line = ""
+    while tsv_stream.readLine( line ):
+        c += 1
+        if c mod 1000 == 0: stderr.write( &"Parsing {file}... {c}\r" )
+
+        var row = line.split( '\t' )
+        try:
+            case file
+
+                # nconst primaryName birthYear deathYear primaryProfession knownForTitles
+                of "name.basics":
+                    if row.len < 4: continue
+                    row = row[0..3]
+                    row[0] = $row[0].replace( "nm" ).parseInt()
+
+                # tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
+                of "title.basics":
+                    if row.len < 8 or row[1] != "movie": continue
+                    # Keep tconst, primaryTitle, startYear and runtimeMinutes.
+                    let mid = row[0].replace( "tt" ).parseInt()
+                    row = @[ $mid, row[2], row[5], row[7] ]
+
+                # tconst ordering nconst category job characters
+                of "title.principals":
+                    if row.len < 4: continue
+                    if row[3] != "actor" and row[3] != "actress": continue
+                    # Person first, then movie: this matches the "pid,mid"
+                    # header above and the FROM Actor TO Movie relationship.
+                    let pid = row[2].replace( "nm" ).parseInt()
+                    let mid = row[0].replace( "tt" ).parseInt()
+                    row = @[ $pid, $mid ]
+
+
+            if file.contains( ".basics" ):
+                row.applyIt(
+                    # empty value / null
+                    if it == "\\N": ""
+
+                    # RFC 4180 escapes
+                    elif it.contains( "\"" ) or it.contains( ',' ):
+                        "\"" & it.replace( "\"", "\"\"" ) & "\""
+
+                    else: it
+                )
+
+            csv_file.write( row.join(","), "\n" )
+
+        # parseInt() rejects the TSV header row and any id that isn't
+        # numeric once its prefix is stripped; those rows are dropped.
+        except ValueError:
+            continue
+
+    tsv_stream.close()
+    csv_file.close()
+    stderr.write( "\n" )
+
+
+#
+# Ok, now import into a fresh kuzu database.
+#
+
+# Work out whether a database is already present *before* opening it:
+# opening a handle is what creates it on disk.  Kuzu 0.8 keeps a database
+# in a directory, which fileExists() doesn't see, so look for either kind.
+let existing = DB.dirExists or DB.fileExists
+
+var db   = newKuzuDatabase( DB )
+var conn = db.connect()
+
+if not existing:
+    # Total of the execution times reported by each query.
+    var duration = 0
+
+    for schema in @[
+        """CREATE NODE TABLE Actor (actorId INT64, name STRING, birthYear INT, deathYear INT, PRIMARY KEY (actorId))""",
+        """CREATE NODE TABLE Movie (movieId INT64, title STRING, year INT, durationMins INT, PRIMARY KEY (movieId))""",
+        """CREATE REL TABLE ActedIn (FROM Actor TO Movie)"""
+    ]:
+        let res = conn.query( schema )
+        duration += res.execution_time.int
+
+    echo &"Created database schema in {duration}ms."
+    duration = 0
+
+    for dataload in @[
+        """COPY Actor FROM "./name.basics.csv" (header=true, ignore_errors=true)""",
+        """COPY Movie FROM "./title.basics.csv" (header=true, ignore_errors=true)""",
+        """COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
+    ]:
+        echo dataload
+        let res = conn.query( dataload )
+        duration += res.execution_time.int
+
+    echo &"Imported data in {duration / 1000}s."
+    echo "Done!"
+
+else:
+    echo &"Database appears to already exist, skipping data import."
+
--- a/kuzu.nimble
+++ b/kuzu.nimble
@ -0,0 +1,17 @@
+# vim: set et sta sw=4 ts=4 :
+
+version     = "0.1.0"
+author      = "Mahlon E. Smith"
+description = "Kuzu is an embedded graph database built for query speed and scalability."
+license     = "MIT"
+srcDir      = "src"
+
+requires "nim ^= 2.0.0"
+
+# Development dependencies (futhark: -d:futharkWrap builds of src/kuzu.nim; zip: experiments/imdb).
+#requires "futhark ^= 0.15.0"
+#requires "zip ^= 0.3.1"
+
+task makewrapper, "Generate the C wrapper using Futhark":
+    exec "nim c -d:futharkWrap --outdir=. src/kuzu.nim"  # selects the futharkWrap branch in src/kuzu.nim
+
--- a/src/kuzu.nim
+++ b/src/kuzu.nim
@ -0,0 +1,36 @@
+# vim: set et sta sw=4 ts=4 :
+#
+
+{.passL:"-lkuzu".}
+
+when defined( futharkWrap ):  # built with -d:futharkWrap: produce the wrapper from the C header
+    import futhark, os
+
+    importc:
+        outputPath currentSourcePath.parentDir / "kuzu" / "0.8.2.nim"  # same file the else branch includes
+        "kuzu.h"
+else:  # regular builds: use the wrapper file that is already on disk
+    include "kuzu/0.8.2.nim"
+
+import
+    std/strformat
+
+include
+    "kuzu/constants.nim",
+    "kuzu/types.nim",
+    "kuzu/config.nim",
+    "kuzu/database.nim",
+    "kuzu/connection.nim",
+    "kuzu/queries.nim"
+
+when isMainModule:
+    # Compiled as a program rather than imported: report the wrapper
+    # version next to the version of the Kuzu library that got linked in.
+    echo "Nim-Kuzu version: ", KUZU_VERSION, ". Expected library version: ",
+        KUZU_EXPECTED_LIBVERSION, "."
+    echo "Installed Kuzu library version ", KUZU_LIBVERSION, " (storage version ",
+        KUZU_STORAGE_VERSION, ")"
+
+    if KUZU_EXPECTED_LIBVERSION != KUZU_LIBVERSION:
+        echo "This library wraps a different version of Kuzu than what is installed."
+        echo "Behavior may be unexpected!"
+    else:
+        echo "Versions match!"
+
--- a/src/kuzu/0.8.2.nim
+++ b/src/kuzu/0.8.2.nim
--- a/src/kuzu/config.nim
+++ b/src/kuzu/config.nim
@ -0,0 +1,24 @@
+# vim: set et sta sw=4 ts=4 :
+
+proc kuzuConfig*(
+    buffer_pool_size     = KUZU_DEFAULT_CONFIG.buffer_pool_size,
+    max_num_threads      = KUZU_DEFAULT_CONFIG.max_num_threads,
+    enable_compression   = KUZU_DEFAULT_CONFIG.enable_compression,
+    read_only            = KUZU_DEFAULT_CONFIG.read_only,
+    max_db_size          = KUZU_DEFAULT_CONFIG.max_db_size,
+    auto_checkpoint      = KUZU_DEFAULT_CONFIG.auto_checkpoint,
+    checkpoint_threshold = KUZU_DEFAULT_CONFIG.checkpoint_threshold
+    ): kuzu_system_config =
+    ## Build a kuzu database configuration object.  Any option that isn't
+    ## passed takes its value from the matching KUZU_DEFAULT_CONFIG field.
+
+    result.buffer_pool_size     = buffer_pool_size
+    result.max_num_threads      = max_num_threads
+    result.enable_compression   = enable_compression
+    result.read_only            = read_only
+    result.max_db_size          = max_db_size
+    result.auto_checkpoint      = auto_checkpoint
+    result.checkpoint_threshold = checkpoint_threshold
+
+
--- a/src/kuzu/connection.nim
+++ b/src/kuzu/connection.nim
@ -0,0 +1,24 @@
+# vim: set et sta sw=4 ts=4 :
+
+proc `=destroy`*( conn: KuzuConnectionObj ) =
+    ## Destructor hook: releases the wrapped connection handle.
+    kuzu_connection_destroy( conn.handle.addr )
+
+
+proc connect*( db: KuzuDB ): KuzuConnection =
+    ## Open a connection to the database +db+.
+    ## Raises a KuzuException if the connection can't be initialized.
+    new result
+    let state = kuzu_connection_init( addr db.handle, addr result.handle )
+    if state == KuzuSuccess: return
+    raise newException( KuzuException, "Unable to connect to the database." )
+
+
+proc queryTimeout*( conn: KuzuConnection, timeout: uint64 ) =
+    ## Set a maximum time limit (in milliseconds) for query runtime.
+    ## Raises a KuzuException if the limit could not be applied, rather
+    ## than silently leaving the connection without one.
+    if kuzu_connection_set_query_timeout( addr conn.handle, timeout ) != KuzuSuccess:
+        raise newException( KuzuException, "Unable to set the query timeout." )
+
+
+proc queryInterrupt*( conn: KuzuConnection ) =
+    ## Ask Kuzu to interrupt the queries running on this connection.
+    kuzu_connection_interrupt( conn.handle.addr )
+
--- a/src/kuzu/constants.nim
+++ b/src/kuzu/constants.nim
@ -0,0 +1,10 @@
+# vim: set et sta sw=4 ts=4 :
+
+# Version of these Nim bindings, and the Kuzu release they expect to find.
+const
+    KUZU_VERSION*             = "0.1.0"
+    KUZU_EXPECTED_LIBVERSION* = "0.8.2"
+
+# Values asked of the Kuzu library itself, once, when the module loads.
+let
+    KUZU_LIBVERSION*      = kuzu_get_version()
+    KUZU_STORAGE_VERSION* = kuzu_get_storage_version()
+    KUZU_DEFAULT_CONFIG*  = kuzu_default_system_config()
+
+
--- a/src/kuzu/database.nim
+++ b/src/kuzu/database.nim
@ -0,0 +1,21 @@
+# vim: set et sta sw=4 ts=4 :
+
+proc `=destroy`*( db: KuzuDBObj ) =
+    ## Destructor hook: releases the wrapped database handle once the
+    ## object is no longer in use.
+    kuzu_database_destroy( db.handle.addr )
+
+
+proc newKuzuDatabase*( path="", config=kuzuConfig() ): KuzuDB =
+    ## Create a new Kuzu database handle.  Without a +path+ (or with
+    ## ":memory:") the database is in-memory; otherwise it is written to
+    ## disk at +path+.  Raises a KuzuException if it can't be opened.
+
+    let inMemory = path == "" or path == ":memory:"
+
+    new result
+    result.config = config
+    result.path   = if inMemory: "(in-memory)" else: path
+    result.handle = kuzu_database()
+
+    if kuzu_database_init( path, config, addr result.handle ) != KuzuSuccess:
+        raise newException( KuzuException, "Unable to open database." )
+
+
--- a/src/kuzu/queries.nim
+++ b/src/kuzu/queries.nim
@ -0,0 +1,23 @@
+# vim: set et sta sw=4 ts=4 :
+
+proc `=destroy`*( query: KuzuQueryResultObj ) =
+    ## Destructor hook: releases the result handle, then its summary.
+    kuzu_query_result_destroy( query.handle.addr )
+    kuzu_query_summary_destroy( query.summary.addr )
+
+
+proc query*( conn: KuzuConnection, query: string ): KuzuQueryResult =
+    ## Perform a database +query+ and return the result.
+    ## Raises a KuzuQueryException, carrying Kuzu's error message, if
+    ## the query fails.
+    result = new KuzuQueryResult
+    let rv = kuzu_connection_query( addr conn.handle, query, addr result.handle )
+
+    if rv != KuzuSuccess:
+        # The message is allocated by the Kuzu library: copy it into a
+        # Nim string, then hand the C string back so it isn't leaked.
+        let err = kuzu_query_result_get_error_message( addr result.handle )
+        let msg = $err
+        if not err.isNil: kuzu_destroy_string( err )
+        raise newException( KuzuQueryException, &"Error running query: {msg}" )
+
+    result.num_columns = kuzu_query_result_get_num_columns( addr result.handle )
+    result.num_tuples  = kuzu_query_result_get_num_tuples( addr result.handle )
+
+    # Only read timings from a summary that was actually populated;
+    # otherwise they keep their default of 0.
+    if kuzu_query_result_get_query_summary( addr result.handle, addr result.summary ) == KuzuSuccess:
+        result.compile_time   = kuzu_query_summary_get_compiling_time( addr result.summary )
+        result.execution_time = kuzu_query_summary_get_execution_time( addr result.summary )
+
+
--- a/src/kuzu/types.nim
+++ b/src/kuzu/types.nim
@ -0,0 +1,25 @@
+# vim: set et sta sw=4 ts=4 :
+
+type
+    KuzuDBObj = object  # state behind a KuzuDB; cleaned up by its `=destroy` hook
+        handle*: kuzu_database  # C-level handle passed to the kuzu_database_* calls
+        path*: string  # path given to newKuzuDatabase, or "(in-memory)"
+        config*: kuzu_system_config  # configuration the database was opened with
+    KuzuDB* = ref KuzuDBObj
+
+    KuzuConnectionObj = object  # state behind a KuzuConnection
+        handle*: kuzu_connection  # C-level handle passed to the kuzu_connection_* calls
+    KuzuConnection* = ref KuzuConnectionObj
+
+    KuzuQueryResultObj = object  # state behind a KuzuQueryResult
+        handle*:         kuzu_query_result  # C-level result handle
+        summary:         kuzu_query_summary  # C-level summary handle; not exported
+        num_columns*:    uint64 = 0  # set by query() from kuzu_query_result_get_num_columns
+        num_tuples*:     uint64 = 0  # set by query() from kuzu_query_result_get_num_tuples
+        compile_time*:   cdouble = 0  # from the query summary; presumably milliseconds -- confirm
+        execution_time*: cdouble = 0  # from the query summary; presumably milliseconds -- confirm
+    KuzuQueryResult* = ref KuzuQueryResultObj
+
+    KuzuException* = object of CatchableError  # raised by newKuzuDatabase() and connect()
+    KuzuQueryException* = object of KuzuException  # raised by query() when a query fails
+
--- a/tmp/.placeholder
+++ b/tmp/.placeholder