Initial commit, adding work to date.

FossilOrigin-Name: e866fb59b96496de50c85d483ec89bdb8f48fb2d24c43b8bfecdc0a968962114
This commit is contained in:
mahlon 2025-03-15 20:26:58 +00:00
parent ef8d5a489e
commit 9ee085d86b
12 changed files with 2486 additions and 0 deletions

1
README.md Normal file
View file

@ -0,0 +1 @@
**TBD**

View file

@ -0,0 +1,145 @@
# vim: set et sta sw=4 ts=4 :
#
# Fetches movie and actor data from IMDB and converts it
# to CSV, then imports into a Kuzu graph database.
#
# Only cares about actors in movies! Things like writers,
# directors, or TV shows are intentionally omitted.
#
# Compile:
# % nim c -d:release imdbdata.nim
#
# Sourced from: https://datasets.imdbws.com/
# See: https://developer.imdb.com/non-commercial-datasets/
import
std/os,
std/sequtils,
std/strformat,
std/strutils,
zip/gzipfiles,
kuzu
# Path handed to newKuzuDatabase() for the database that gets populated.
const DB = "imdb"
# Base URL that each "<name>.tsv.gz" dataset is downloaded from.
const SOURCE = "https://datasets.imdbws.com"
# Dataset base names; each one is read from "<name>.tsv.gz" and written
# out as "<name>.csv".
const FILES = @[ "name.basics", "title.basics", "title.principals" ]
#
# Prep everything!
#
func csvField( value: string ): string =
    ## Render a single TSV value as a CSV field. The IMDB null marker
    ## ("\N") becomes an empty field, and any value containing a double
    ## quote or a comma is quoted with embedded quotes doubled (RFC 4180).
    if value == "\\N":
        ""
    elif value.contains( '"' ) or value.contains( ',' ):
        "\"" & value.replace( "\"", "\"\"" ) & "\""
    else:
        value


# Convert each gzipped IMDB TSV dataset into a trimmed-down CSV file that
# only carries the columns the graph schema needs.
for file in FILES:
    let tsvgz = &"{file}.tsv.gz"
    let csv = &"{file}.csv"

    if csv.fileExists:
        echo &"Skipping {file}, csv already exists."
        continue

    if not tsvgz.fileExists:
        echo &"Downloading file: {file}..."
        # Stop on a failed download. Carrying on would leave behind a
        # header-only CSV that later runs would treat as already converted.
        let status = execShellCmd &"wget {SOURCE}/{tsvgz}"
        if status != 0:
            quit( &"Unable to download {tsvgz} (wget exited with {status}).", 1 )

    let tsv_stream = newGzFileStream( tsvgz )
    let csv_file = open( csv, fmWrite )

    # Make sure both handles are released even if the conversion fails.
    try:
        case file
        of "name.basics":
            csv_file.write( "pid,name,birthYear,deathYear\n" )
        of "title.basics":
            csv_file.write( "mid,title,year,durationMins\n" )
        of "title.principals":
            csv_file.write( "pid,mid\n" )
        else:
            discard

        var c = 0
        var line = ""
        while tsv_stream.readLine( line ):
            c += 1
            if c mod 1000 == 0: stderr.write( &"Parsing {file}... {c}\r" )

            var row = line.split( '\t' )

            # A ValueError from parseInt() means the ID column was not
            # numeric (this includes the TSV header line): skip the row.
            try:
                case file

                # nconst primaryName birthYear deathYear primaryProfession knownForTitles
                of "name.basics":
                    if row.len < 4: continue
                    row = row[0..3]
                    row[0] = $row[0].replace( "nm" ).parseInt()

                # tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
                of "title.basics":
                    if row.len < 8 or row[1] != "movie": continue
                    # Keep: tconst, primaryTitle, startYear, runtimeMinutes.
                    row = @[ row[0], row[2], row[5], row[7] ]
                    row[0] = $row[0].replace( "tt" ).parseInt()

                # tconst ordering nconst category job characters
                of "title.principals":
                    if row.len < 4: continue
                    if row[3] != "actor" and row[3] != "actress": continue
                    let mid = $row[0].replace( "tt" ).parseInt()
                    let pid = $row[2].replace( "nm" ).parseInt()
                    # The person ID goes first: this matches the "pid,mid"
                    # header and the FROM Actor TO Movie relationship.
                    row = @[ pid, mid ]

                else:
                    discard

                if file.contains( ".basics" ):
                    row.applyIt( csvField( it ) )

                csv_file.write( row.join(","), "\n" )

            except ValueError:
                continue
    finally:
        tsv_stream.close()
        csv_file.close()

    stderr.write( "\n" )
#
# Ok, now import into a fresh kuzu database.
#
# Decide whether the database is new *before* opening it: the existence
# check must not be influenced by anything the open call creates. The
# database may be stored as a directory or as a single file, so look
# for either one.
let dbExisted = DB.dirExists or DB.fileExists

let db = newKuzuDatabase( DB )
let conn = db.connect()

if not dbExisted:
    # Total reported execution time, accumulated across statements.
    var duration = 0

    for schema in @[
        """CREATE NODE TABLE Actor (actorId INT64, name STRING, birthYear INT, deathYear INT, PRIMARY KEY (actorId))""",
        """CREATE NODE TABLE Movie (movieId INT64, title STRING, year INT, durationMins INT, PRIMARY KEY (movieId))""",
        """CREATE REL TABLE ActedIn (FROM Actor TO Movie)"""
    ]:
        let res = conn.query( schema )
        duration += res.execution_time.int
    echo &"Created database schema in {duration}ms."

    duration = 0
    for dataload in @[
        """COPY Actor FROM "./name.basics.csv" (header=true, ignore_errors=true)""",
        """COPY Movie FROM "./title.basics.csv" (header=true, ignore_errors=true)""",
        """COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
    ]:
        echo dataload
        let res = conn.query( dataload )
        duration += res.execution_time.int
    echo &"Imported data in {duration / 1000}s."
    echo "Done!"

else:
    echo &"Database appears to already exist, skipping data import."

17
kuzu.nimble Normal file
View file

@ -0,0 +1,17 @@
# vim: set et sta sw=4 ts=4 :
# Package metadata.
version = "0.1.0"
author = "Mahlon E. Smith"
description = "Kuzu is an embedded graph database built for query speed and scalability."
license = "MIT"
# Library sources live under src/.
srcDir = "src"

# Runtime dependencies.
requires "nim ^= 2.0.0"

# Development dependencies.
#requires "futhark ^= 0.15.0"
#requires "zip ^= 0.3.1"

# Compiles src/kuzu.nim with -d:futharkWrap defined, placing build output
# in the current directory (--outdir=.).
task makewrapper, "Generate the C wrapper using Futhark":
    exec "nim c -d:futharkWrap --outdir=. src/kuzu.nim"

36
src/kuzu.nim Normal file
View file

@ -0,0 +1,36 @@
# vim: set et sta sw=4 ts=4 :
#
# Link against the Kuzu library (-lkuzu).
{.passL:"-lkuzu".}

# When compiled with -d:futharkWrap, hand "kuzu.h" to Futhark and write the
# output to kuzu/0.8.2.nim alongside this source file. Otherwise, include
# the previously generated wrapper from that same location.
when defined( futharkWrap ):
    import futhark, os

    importc:
        outputPath currentSourcePath.parentDir / "kuzu" / "0.8.2.nim"
        "kuzu.h"
else:
    include "kuzu/0.8.2.nim"
import
std/strformat
include
"kuzu/constants.nim",
"kuzu/types.nim",
"kuzu/config.nim",
"kuzu/database.nim",
"kuzu/connection.nim",
"kuzu/queries.nim"
when isMainModule:
    # When run directly, report the binding version, the library version it
    # was written against, and the library version actually installed.
    echo "Nim-Kuzu version: " & KUZU_VERSION &
        ". Expected library version: " & KUZU_EXPECTED_LIBVERSION & "."
    echo "Installed Kuzu library version " & $KUZU_LIBVERSION &
        " (storage version " & $KUZU_STORAGE_VERSION & ")"

    # Warn if the installed library is not the one this wrapper targets.
    if KUZU_EXPECTED_LIBVERSION != KUZU_LIBVERSION:
        echo "This library wraps a different version of Kuzu than what is installed."
        echo "Behavior may be unexpected!"
    else:
        echo "Versions match!"

2160
src/kuzu/0.8.2.nim Normal file

File diff suppressed because it is too large Load diff

24
src/kuzu/config.nim Normal file
View file

@ -0,0 +1,24 @@
# vim: set et sta sw=4 ts=4 :
proc kuzuConfig*(
    buffer_pool_size = KUZU_DEFAULT_CONFIG.buffer_pool_size,
    max_num_threads = KUZU_DEFAULT_CONFIG.max_num_threads,
    enable_compression = KUZU_DEFAULT_CONFIG.enable_compression,
    read_only = KUZU_DEFAULT_CONFIG.read_only,
    max_db_size = KUZU_DEFAULT_CONFIG.max_db_size,
    auto_checkpoint = KUZU_DEFAULT_CONFIG.auto_checkpoint,
    checkpoint_threshold = KUZU_DEFAULT_CONFIG.checkpoint_threshold
): kuzu_system_config =
    ## Build a Kuzu system configuration. Any setting that is not supplied
    ## takes its value from `KUZU_DEFAULT_CONFIG`.
    result.buffer_pool_size = buffer_pool_size
    result.max_num_threads = max_num_threads
    result.enable_compression = enable_compression
    result.read_only = read_only
    result.max_db_size = max_db_size
    result.auto_checkpoint = auto_checkpoint
    result.checkpoint_threshold = checkpoint_threshold

24
src/kuzu/connection.nim Normal file
View file

@ -0,0 +1,24 @@
# vim: set et sta sw=4 ts=4 :
proc `=destroy`*( conn: KuzuConnectionObj ) =
    ## Destructor hook: hands the wrapped connection handle back to Kuzu
    ## so that it is released along with the owning object.
    let handlePtr = addr conn.handle
    kuzu_connection_destroy( handlePtr )
proc connect*( db: KuzuDB ): KuzuConnection =
    ## Open a new connection to the database +db+.
    ## Raises a KuzuException if the connection can't be initialized.
    new result
    let status = kuzu_connection_init( addr db.handle, addr result.handle )
    if status == KuzuSuccess:
        return
    raise newException( KuzuException, "Unable to connect to the database." )
proc queryTimeout*( conn: KuzuConnection, timeout: uint64 ) =
    ## Set a maximum time limit (in milliseconds) for query runtime.
    ##
    ## Raises a KuzuException if the library reports that the timeout
    ## could not be applied, so a caller is never left believing a limit
    ## is in effect when it isn't.
    let status = kuzu_connection_set_query_timeout( addr conn.handle, timeout )
    if status != KuzuSuccess:
        raise newException( KuzuException, "Unable to set the query timeout." )
proc queryInterrupt*( conn: KuzuConnection ) =
    ## Ask Kuzu to interrupt whatever is currently executing on +conn+.
    let handlePtr = addr conn.handle
    kuzu_connection_interrupt( handlePtr )

10
src/kuzu/constants.nim Normal file
View file

@ -0,0 +1,10 @@
# vim: set et sta sw=4 ts=4 :
# Version of this Nim binding.
const KUZU_VERSION* = "0.1.0"
# Kuzu library version that the binding expects to be used with.
const KUZU_EXPECTED_LIBVERSION* = "0.8.2"

# The values below are obtained from the linked Kuzu library when the
# module is initialized, not at compile time.
let KUZU_LIBVERSION* = kuzu_get_version()
let KUZU_STORAGE_VERSION* = kuzu_get_storage_version()
# Library-provided defaults; used as the parameter defaults of kuzuConfig().
let KUZU_DEFAULT_CONFIG* = kuzu_default_system_config()

21
src/kuzu/database.nim Normal file
View file

@ -0,0 +1,21 @@
# vim: set et sta sw=4 ts=4 :
proc `=destroy`*( db: KuzuDBObj ) =
    ## Destructor hook: releases the wrapped Kuzu database handle once
    ## the owning object goes out of scope.
    let handlePtr = addr db.handle
    kuzu_database_destroy( handlePtr )
proc newKuzuDatabase*( path="", config=kuzuConfig() ): KuzuDB =
    ## Open a Kuzu database and return a handle to it. With no +path+ the
    ## database is in-memory; supplying a +path+ stores it on disk.
    ##
    ## The returned object's +path+ field reads "(in-memory)" when +path+
    ## is empty or ":memory:". Raises a KuzuException if the database
    ## can't be opened.
    let inMemory = path == "" or path == ":memory:"

    new result
    result.config = config
    result.path =
        if inMemory: "(in-memory)"
        else: path

    let status = kuzu_database_init( path, config, addr result.handle )
    if status != KuzuSuccess:
        raise newException( KuzuException, "Unable to open database." )

23
src/kuzu/queries.nim Normal file
View file

@ -0,0 +1,23 @@
# vim: set et sta sw=4 ts=4 :
proc `=destroy`*( query: KuzuQueryResultObj ) =
    ## Destructor hook: releases the query result handle first, then the
    ## query summary that was fetched from it.
    let resultPtr = addr query.handle
    let summaryPtr = addr query.summary
    kuzu_query_result_destroy( resultPtr )
    kuzu_query_summary_destroy( summaryPtr )
proc query*( conn: KuzuConnection, query: string ): KuzuQueryResult =
    ## Perform a database +query+ and return the result.
    ##
    ## On success the result carries the column and tuple counts, along
    ## with the compile and execution times from the query summary (the
    ## timings stay at their defaults if the summary can't be fetched).
    ##
    ## Raises a KuzuQueryException holding the library's error message if
    ## the query fails.
    result = new KuzuQueryResult

    let status = kuzu_connection_query( addr conn.handle, query, addr result.handle )
    if status != KuzuSuccess:
        # The C API allocates the message and expects the caller to free
        # it: copy it into a Nim string, then release the original.
        let rawErr = kuzu_query_result_get_error_message( addr result.handle )
        let err = $rawErr
        kuzu_destroy_string( rawErr )
        raise newException( KuzuQueryException, &"Error running query: {err}" )

    result.num_columns = kuzu_query_result_get_num_columns( addr result.handle )
    result.num_tuples = kuzu_query_result_get_num_tuples( addr result.handle )

    # Only read timings out of a summary that was actually populated.
    let summaryStatus = kuzu_query_result_get_query_summary( addr result.handle, addr result.summary )
    if summaryStatus == KuzuSuccess:
        result.compile_time = kuzu_query_summary_get_compiling_time( addr result.summary )
        result.execution_time = kuzu_query_summary_get_execution_time( addr result.summary )

25
src/kuzu/types.nim Normal file
View file

@ -0,0 +1,25 @@
# vim: set et sta sw=4 ts=4 :
type
    KuzuDBObj = object
        ## Wraps a Kuzu database handle; released by the `=destroy` hook.
        handle*: kuzu_database ## Handle filled in by kuzu_database_init().
        path*: string ## Path given at creation, or "(in-memory)".
        config*: kuzu_system_config ## Configuration the database was opened with.
    KuzuDB* = ref KuzuDBObj

    KuzuConnectionObj = object
        ## Wraps a Kuzu connection handle; released by the `=destroy` hook.
        handle*: kuzu_connection ## Handle filled in by kuzu_connection_init().
    KuzuConnection* = ref KuzuConnectionObj

    KuzuQueryResultObj = object
        ## Wraps a query result along with figures copied out of it.
        handle*: kuzu_query_result ## Handle filled in by kuzu_connection_query().
        summary: kuzu_query_summary ## Summary handle (module-private).
        num_columns*: uint64 = 0 ## Column count reported for the result.
        num_tuples*: uint64 = 0 ## Tuple (row) count reported for the result.
        compile_time*: cdouble = 0 ## Compile time from the summary; presumably ms -- TODO confirm.
        execution_time*: cdouble = 0 ## Execution time from the summary; presumably ms -- TODO confirm.
    KuzuQueryResult* = ref KuzuQueryResultObj

    # Base error for this library; raised when opening or connecting fails.
    KuzuException* = object of CatchableError
    # Raised by query() when a query can't be run.
    KuzuQueryException* = object of KuzuException

0
tmp/.placeholder Normal file
View file