Initial commit, adding work to date.
FossilOrigin-Name: e866fb59b96496de50c85d483ec89bdb8f48fb2d24c43b8bfecdc0a968962114
This commit is contained in:
parent
ef8d5a489e
commit
9ee085d86b
12 changed files with 2486 additions and 0 deletions
1
README.md
Normal file
1
README.md
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
**TBD**
|
||||||
145
experiments/imdb/imdbimport.nim
Normal file
145
experiments/imdb/imdbimport.nim
Normal file
|
|
@ -0,0 +1,145 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
#
|
||||||
|
# Fetches movie and actor data from IMDB and converts it
|
||||||
|
# to CSV, then imports into a Kuzu graph database.
|
||||||
|
#
|
||||||
|
# Only cares about actors in movies! Things like writers,
|
||||||
|
# directors, or TV shows are intentionally omitted.
|
||||||
|
#
|
||||||
|
# Compile:
|
||||||
|
# % nim c -d:release imdbimport.nim
|
||||||
|
#
|
||||||
|
# Sourced from: https://datasets.imdbws.com/
|
||||||
|
# See: https://developer.imdb.com/non-commercial-datasets/
|
||||||
|
|
||||||
|
import
|
||||||
|
std/os,
|
||||||
|
std/sequtils,
|
||||||
|
std/strformat,
|
||||||
|
std/strutils,
|
||||||
|
zip/gzipfiles,
|
||||||
|
kuzu
|
||||||
|
|
||||||
|
const DB     = "imdb"
const SOURCE = "https://datasets.imdbws.com"
const FILES  = @[ "name.basics", "title.basics", "title.principals" ]


proc csvField( value: string ): string =
    ## Convert a single IMDB TSV value into a CSV field.
    ##
    ## The IMDB null marker (a literal backslash followed by N) becomes
    ## an empty field.  Values containing a double quote or a comma are
    ## quoted, with embedded quotes doubled (RFC 4180).  Anything else is
    ## returned unchanged.
    if value == "\\N":
        return ""
    if value.contains( "\"" ) or value.contains( ',' ):
        return "\"" & value.replace( "\"", "\"\"" ) & "\""
    return value


#
# Prep everything!
#
for file in FILES:
    var c = 0
    let tsvgz = &"{file}.tsv.gz"
    let csv   = &"{file}.csv"

    if csv.fileExists:
        echo &"Skipping {file}, csv already exists."
        continue

    if not tsvgz.fileExists:
        echo &"Downloading file: {file}..."
        let rc = execShellCmd &"wget {SOURCE}/{tsvgz}"
        if rc != 0:
            # Don't leave a partial download behind -- the next run would
            # otherwise treat it as a complete file.
            if tsvgz.fileExists: removeFile( tsvgz )
            quit( &"Unable to download {SOURCE}/{tsvgz} (wget exited with status {rc}).", 1 )

    let tsv_stream = newGzFileStream( tsvgz )
    let csv_file   = open( csv, fmWrite )

    case file
    of "name.basics":
        csv_file.write( "pid,name,birthYear,deathYear\n" )
    of "title.basics":
        csv_file.write( "mid,title,year,durationMins\n" )
    of "title.principals":
        csv_file.write( "pid,mid\n" )
    else:
        discard

    var line = ""
    while tsv_stream.readLine( line ):
        c += 1
        if c mod 1000 == 0: stderr.write( &"Parsing {file}... {c}\r" )

        var row = line.split( '\t' )

        # Rows whose identifiers fail to parse as integers (including each
        # file's own header line) raise ValueError and are skipped.
        try:
            case file

            # nconst primaryName birthYear deathYear primaryProfession knownForTitles
            of "name.basics":
                # Short rows would raise IndexDefect, which is not a ValueError.
                if row.len < 4: continue
                row = row[0..3]
                row[0] = $row[0].replace( "nm" ).parseInt()

            # tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
            of "title.basics":
                if row.len < 9: continue
                if row[1] != "movie": continue
                # Keep tconst, primaryTitle, startYear and runtimeMinutes.
                row = @[ $row[0].replace( "tt" ).parseInt(), row[2], row[5], row[7] ]

            # tconst ordering nconst category job characters
            of "title.principals":
                if row.len < 4: continue
                if row[3] != "actor" and row[3] != "actress": continue
                let mid = row[0].replace( "tt" ).parseInt()
                let pid = row[2].replace( "nm" ).parseInt()
                # Emit person first, then movie: this matches the "pid,mid"
                # header written above and the direction of the ActedIn
                # relation (FROM Actor TO Movie) the CSV is loaded into.
                row = @[ $pid, $mid ]

            else:
                discard

            # Only the .basics files carry free-form text that needs escaping.
            if file.contains( ".basics" ):
                row.applyIt( csvField( it ) )

            csv_file.write( row.join(","), "\n" )

        except ValueError:
            continue

    tsv_stream.close()
    csv_file.close()
    stderr.write( "\n" )
|
||||||
|
|
||||||
|
|
||||||
|
#
# Ok, now import into a fresh kuzu database.
#

# Record whether the database was already present *before* opening it,
# since opening a path may create it on disk.  Both checks are needed:
# fileExists() is false for directories, so it alone would miss a
# directory-based store.
let dbAlreadyExists = DB.fileExists or DB.dirExists

var db   = newKuzuDatabase( DB )
var conn = db.connect()

if not dbAlreadyExists:
    # Accumulated execution time as reported by each query result.
    var duration = 0

    for schema in @[
        """CREATE NODE TABLE Actor (actorId INT64, name STRING, birthYear INT, deathYear INT, PRIMARY KEY (actorId))""",
        """CREATE NODE TABLE Movie (movieId INT64, title STRING, year INT, durationMins INT, PRIMARY KEY (movieId))""",
        """CREATE REL TABLE ActedIn (FROM Actor TO Movie)"""
    ]:
        var result = conn.query( schema )
        duration += result.execution_time.int

    echo &"Created database schema in {duration}ms."
    duration = 0

    for dataload in @[
        """COPY Actor FROM "./name.basics.csv" (header=true, ignore_errors=true)""",
        """COPY Movie FROM "./title.basics.csv" (header=true, ignore_errors=true)""",
        """COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
    ]:
        echo dataload
        var result = conn.query( dataload )
        duration += result.execution_time.int

    echo &"Imported data in {duration / 1000}s."
    echo "Done!"

else:
    echo &"Database appears to already exist, skipping data import."
|
||||||
|
|
||||||
17
kuzu.nimble
Normal file
17
kuzu.nimble
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :

# Package metadata.
version     = "0.1.0"
author      = "Mahlon E. Smith"
description = "Kuzu is an embedded graph database built for query speed and scalability."
license     = "MIT"
srcDir      = "src"

# Runtime dependencies.
requires( "nim ^= 2.0.0" )

# Development dependencies.
#requires "futhark ^= 0.15.0"
#requires "zip ^= 0.3.1"

# Builds src/kuzu.nim with -d:futharkWrap, the switch that module uses to
# regenerate its C bindings instead of including the checked-in copy.
task makewrapper, "Generate the C wrapper using Futhark":
    exec( "nim c -d:futharkWrap --outdir=. src/kuzu.nim" )
|
||||||
|
|
||||||
36
src/kuzu.nim
Normal file
36
src/kuzu.nim
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
#
|
||||||
|
|
||||||
|
# Link against the installed kuzu shared library.
{.passL: "-lkuzu".}

when not defined( futharkWrap ):
    # Regular builds pull in the previously generated bindings.
    include "kuzu/0.8.2.nim"

else:
    # With -d:futharkWrap, have Futhark read kuzu.h and write the
    # bindings out next to this file, under kuzu/.
    import futhark, os

    importc:
        outputPath currentSourcePath.parentDir / "kuzu" / "0.8.2.nim"
        "kuzu.h"
|
||||||
|
|
||||||
|
import
|
||||||
|
std/strformat
|
||||||
|
|
||||||
|
include
|
||||||
|
"kuzu/constants.nim",
|
||||||
|
"kuzu/types.nim",
|
||||||
|
"kuzu/config.nim",
|
||||||
|
"kuzu/database.nim",
|
||||||
|
"kuzu/connection.nim",
|
||||||
|
"kuzu/queries.nim"
|
||||||
|
|
||||||
|
when isMainModule:
    # When run directly, report the wrapper version alongside the
    # version of the kuzu library that is actually linked.
    echo "Nim-Kuzu version: ", KUZU_VERSION,
        ". Expected library version: ", KUZU_EXPECTED_LIBVERSION, "."
    echo "Installed Kuzu library version ", KUZU_LIBVERSION,
        " (storage version ", KUZU_STORAGE_VERSION, ")"

    if KUZU_EXPECTED_LIBVERSION != KUZU_LIBVERSION:
        echo "This library wraps a different version of Kuzu than what is installed."
        echo "Behavior may be unexpected!"
    else:
        echo "Versions match!"
|
||||||
|
|
||||||
2160
src/kuzu/0.8.2.nim
Normal file
2160
src/kuzu/0.8.2.nim
Normal file
File diff suppressed because it is too large
Load diff
24
src/kuzu/config.nim
Normal file
24
src/kuzu/config.nim
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
proc kuzuConfig*(
    buffer_pool_size     = KUZU_DEFAULT_CONFIG.buffer_pool_size,
    max_num_threads      = KUZU_DEFAULT_CONFIG.max_num_threads,
    enable_compression   = KUZU_DEFAULT_CONFIG.enable_compression,
    read_only            = KUZU_DEFAULT_CONFIG.read_only,
    max_db_size          = KUZU_DEFAULT_CONFIG.max_db_size,
    auto_checkpoint      = KUZU_DEFAULT_CONFIG.auto_checkpoint,
    checkpoint_threshold = KUZU_DEFAULT_CONFIG.checkpoint_threshold
): kuzu_system_config =
    ## Build a kuzu system configuration.  Every argument defaults to
    ## the matching field of KUZU_DEFAULT_CONFIG, so callers only name
    ## the settings they want to override.

    result.buffer_pool_size     = buffer_pool_size
    result.max_num_threads      = max_num_threads
    result.enable_compression   = enable_compression
    result.read_only            = read_only
    result.max_db_size          = max_db_size
    result.auto_checkpoint      = auto_checkpoint
    result.checkpoint_threshold = checkpoint_threshold
|
||||||
|
|
||||||
|
|
||||||
24
src/kuzu/connection.nim
Normal file
24
src/kuzu/connection.nim
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
proc `=destroy`*( conn: KuzuConnectionObj ) =
    ## Destructor hook: hands the wrapped connection handle back to
    ## the kuzu library for cleanup.
    let handle = addr conn.handle
    kuzu_connection_destroy( handle )
|
||||||
|
|
||||||
|
|
||||||
|
proc connect*( db: KuzuDB ): KuzuConnection =
    ## Open a new connection to the database +db+.
    ## Raises a KuzuException if the library reports a failure.
    result = new KuzuConnection
    let state = kuzu_connection_init( addr db.handle, addr result.handle )
    if state == KuzuSuccess:
        return
    raise newException( KuzuException, "Unable to connect to the database." )
|
||||||
|
|
||||||
|
|
||||||
|
proc queryTimeout*( conn: KuzuConnection, timeout: uint64 ) =
    ## Set a maximum time limit (in milliseconds) for query runtime.
    ## Raises a KuzuException if the library reports that the timeout
    ## could not be applied, rather than silently ignoring the failure.
    let state = kuzu_connection_set_query_timeout( addr conn.handle, timeout )
    if state != KuzuSuccess:
        raise newException( KuzuException, "Unable to set the query timeout." )
|
||||||
|
|
||||||
|
|
||||||
|
proc queryInterrupt*( conn: KuzuConnection ) =
    ## Ask the library to interrupt whatever is currently running
    ## on this connection.
    let handle = addr conn.handle
    kuzu_connection_interrupt( handle )
|
||||||
|
|
||||||
10
src/kuzu/constants.nim
Normal file
10
src/kuzu/constants.nim
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
# Compile-time values: the version of this wrapper, and the kuzu
# library version it was generated against.
const
    KUZU_VERSION*             = "0.1.0"
    KUZU_EXPECTED_LIBVERSION* = "0.8.2"

# Values obtained from the linked kuzu library when the module is
# initialized.
let
    KUZU_LIBVERSION*      = kuzu_get_version()
    KUZU_STORAGE_VERSION* = kuzu_get_storage_version()
    KUZU_DEFAULT_CONFIG*  = kuzu_default_system_config()
|
||||||
|
|
||||||
|
|
||||||
21
src/kuzu/database.nim
Normal file
21
src/kuzu/database.nim
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
proc `=destroy`*( db: KuzuDBObj ) =
    ## Destructor hook: hands the wrapped database handle back to the
    ## kuzu library once the object is no longer referenced.
    let handle = addr db.handle
    kuzu_database_destroy( handle )
|
||||||
|
|
||||||
|
|
||||||
|
proc newKuzuDatabase*( path="", config=kuzuConfig() ): KuzuDB =
    ## Open a Kuzu database and return its handle.  With no +path+
    ## (or the special value ":memory:") the stored path is recorded as
    ## "(in-memory)"; otherwise the supplied +path+ is kept.  Raises a
    ## KuzuException if the library fails to initialize the database.

    let inMemory = path == "" or path == ":memory:"

    result = KuzuDB(
        handle: kuzu_database(),
        path:   if inMemory: "(in-memory)" else: path,
        config: config
    )

    # The library always receives the caller's original +path+ value.
    let state = kuzu_database_init( path, config, addr result.handle )
    if state == KuzuSuccess:
        return
    raise newException( KuzuException, "Unable to open database." )
|
||||||
|
|
||||||
|
|
||||||
23
src/kuzu/queries.nim
Normal file
23
src/kuzu/queries.nim
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
proc `=destroy`*( query: KuzuQueryResultObj ) =
    ## Destructor hook: releases the query result first, then the
    ## query summary attached to it.
    let resultHandle  = addr query.handle
    let summaryHandle = addr query.summary
    kuzu_query_result_destroy( resultHandle )
    kuzu_query_summary_destroy( summaryHandle )
|
||||||
|
|
||||||
|
|
||||||
|
proc query*( conn: KuzuConnection, query: string ): KuzuQueryResult =
    ## Perform a database +query+ and return the result.
    ##
    ## On success the column/tuple counts and the compile and execution
    ## times are copied from the library onto the returned object.
    ## Raises a KuzuQueryException carrying the library's error message
    ## if the query fails.
    result = new KuzuQueryResult

    let state = kuzu_connection_query( addr conn.handle, query, addr result.handle )
    if state != KuzuSuccess:
        let err = kuzu_query_result_get_error_message( addr result.handle )
        # Copy the message into a Nim string, then release the C string --
        # the library hands ownership of it to the caller.
        let message = $err
        if not err.isNil: kuzu_destroy_string( err )
        raise newException( KuzuQueryException, &"Error running query: {message}" )

    discard kuzu_query_result_get_query_summary( addr result.handle, addr result.summary )
    result.num_columns    = kuzu_query_result_get_num_columns( addr result.handle )
    result.num_tuples     = kuzu_query_result_get_num_tuples( addr result.handle )
    result.compile_time   = kuzu_query_summary_get_compiling_time( addr result.summary )
    result.execution_time = kuzu_query_summary_get_execution_time( addr result.summary )
|
||||||
|
|
||||||
|
|
||||||
25
src/kuzu/types.nim
Normal file
25
src/kuzu/types.nim
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# vim: set et sta sw=4 ts=4 :
|
||||||
|
|
||||||
|
# Database handle.  The object and ref types are kept separate so a
# destructor can be attached to the object.
type
    KuzuDBObj = object
        handle*: kuzu_database        ## Underlying library handle.
        path*:   string               ## Path the database was opened with.
        config*: kuzu_system_config   ## Configuration used to open it.
    KuzuDB* = ref KuzuDBObj

# Connection handle.
type
    KuzuConnectionObj = object
        handle*: kuzu_connection      ## Underlying library handle.
    KuzuConnection* = ref KuzuConnectionObj

# Query result, plus values copied out of its summary.
type
    KuzuQueryResultObj = object
        handle*:         kuzu_query_result
        summary:         kuzu_query_summary   # not exported
        num_columns*:    uint64  = 0
        num_tuples*:     uint64  = 0
        compile_time*:   cdouble = 0
        execution_time*: cdouble = 0
    KuzuQueryResult* = ref KuzuQueryResultObj

# Errors raised by this library.
type
    KuzuException*      = object of CatchableError
    KuzuQueryException* = object of KuzuException
|
||||||
|
|
||||||
0
tmp/.placeholder
Normal file
0
tmp/.placeholder
Normal file
Loading…
Add table
Add a link
Reference in a new issue