Fix the IMDB relation import.
FossilOrigin-Name: f6ae20f0aea47d2314bac182b2d49ec44d3e7055e7c0d45919da006c1ecd8baf
This commit is contained in:
parent
9ee085d86b
commit
89e879ca68
1 changed file with 41 additions and 43 deletions
|
|
@@ -4,7 +4,7 @@
|
||||||
# to CSV, then imports into a Kuzu graph database.
|
# to CSV, then imports into a Kuzu graph database.
|
||||||
#
|
#
|
||||||
# Only cares about actors in movies! Things like writers,
|
# Only cares about actors in movies! Things like writers,
|
||||||
# directors, or TV shows are intentionally omitted.
|
# directors, and TV shows are intentionally omitted.
|
||||||
#
|
#
|
||||||
# Compile:
|
# Compile:
|
||||||
# % nim c -d:release imdbdata.nim
|
# % nim c -d:release imdbdata.nim
|
||||||
|
|
@@ -45,11 +45,11 @@ for file in FILES:
|
||||||
|
|
||||||
case file
|
case file
|
||||||
of "name.basics":
|
of "name.basics":
|
||||||
csv_file.write( "pid,name,birthYear,deathYear\n" )
|
csv_file.write( "aid,name,birthYear,deathYear\n" )
|
||||||
of "title.basics":
|
of "title.basics":
|
||||||
csv_file.write( "mid,title,year,durationMins\n" )
|
csv_file.write( "mid,title,year,durationMins\n" )
|
||||||
of "title.principals":
|
of "title.principals":
|
||||||
csv_file.write( "pid,mid\n" )
|
csv_file.write( "aid,mid\n" )
|
||||||
|
|
||||||
var line = ""
|
var line = ""
|
||||||
while tsv_stream.readLine( line ):
|
while tsv_stream.readLine( line ):
|
||||||
|
|
@@ -57,7 +57,6 @@ for file in FILES:
|
||||||
if c mod 1000 == 0: stderr.write( &"Parsing {file}... {c}\r" )
|
if c mod 1000 == 0: stderr.write( &"Parsing {file}... {c}\r" )
|
||||||
|
|
||||||
var row = line.split( '\t' )
|
var row = line.split( '\t' )
|
||||||
try:
|
|
||||||
case file
|
case file
|
||||||
|
|
||||||
# nconst primaryName birthYear deathYear primaryProfession knownForTitles
|
# nconst primaryName birthYear deathYear primaryProfession knownForTitles
|
||||||
|
|
@@ -81,6 +80,7 @@ for file in FILES:
|
||||||
row = row[0..1]
|
row = row[0..1]
|
||||||
row[0] = $row[0].replace( "tt" ).parseInt()
|
row[0] = $row[0].replace( "tt" ).parseInt()
|
||||||
row[1] = $row[1].replace( "nm" ).parseInt()
|
row[1] = $row[1].replace( "nm" ).parseInt()
|
||||||
|
row = @[ row[1], row[0] ]
|
||||||
|
|
||||||
|
|
||||||
if file.contains( ".basics" ):
|
if file.contains( ".basics" ):
|
||||||
|
|
@@ -92,6 +92,7 @@ for file in FILES:
|
||||||
elif it.contains( "\"" ) or it.contains( ',' ):
|
elif it.contains( "\"" ) or it.contains( ',' ):
|
||||||
var value = it
|
var value = it
|
||||||
value = value.replace( "\"", "\"\"" )
|
value = value.replace( "\"", "\"\"" )
|
||||||
|
value = value.replace( ",", "" )
|
||||||
"\"" & value & "\""
|
"\"" & value & "\""
|
||||||
|
|
||||||
else: it
|
else: it
|
||||||
|
|
@@ -99,9 +100,6 @@ for file in FILES:
|
||||||
|
|
||||||
csv_file.write( row.join(","), "\n" )
|
csv_file.write( row.join(","), "\n" )
|
||||||
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
tsv_stream.close()
|
tsv_stream.close()
|
||||||
csv_file.close()
|
csv_file.close()
|
||||||
stderr.write( "\n" )
|
stderr.write( "\n" )
|
||||||
|
|
@@ -118,8 +116,8 @@ if not DB.fileExists:
|
||||||
var duration = 0
|
var duration = 0
|
||||||
|
|
||||||
for schema in @[
|
for schema in @[
|
||||||
"""CREATE NODE TABLE Actor (actorId INT64, name STRING, birthYear INT, deathYear INT, PRIMARY KEY (actorId))""",
|
"""CREATE NODE TABLE Actor (actorId INT64, name STRING, birthYear UINT16, deathYear UINT16, PRIMARY KEY (actorId))""",
|
||||||
"""CREATE NODE TABLE Movie (movieId INT64, title STRING, year INT, durationMins INT, PRIMARY KEY (movieId))""",
|
"""CREATE NODE TABLE Movie (movieId INT64, title STRING, year UINT16, durationMins INT, PRIMARY KEY (movieId))""",
|
||||||
"""CREATE REL TABLE ActedIn (FROM Actor TO Movie)"""
|
"""CREATE REL TABLE ActedIn (FROM Actor TO Movie)"""
|
||||||
]:
|
]:
|
||||||
var result = conn.query( schema )
|
var result = conn.query( schema )
|
||||||
|
|
@@ -129,8 +127,8 @@ if not DB.fileExists:
|
||||||
duration = 0
|
duration = 0
|
||||||
|
|
||||||
for dataload in @[
|
for dataload in @[
|
||||||
"""COPY Actor FROM "./name.basics.csv" (header=true, ignore_errors=true)""",
|
"""COPY Actor FROM "./name.basics.csv" (header=true)""",
|
||||||
"""COPY Movie FROM "./title.basics.csv" (header=true, ignore_errors=true)""",
|
"""COPY Movie FROM "./title.basics.csv" (header=true)""",
|
||||||
"""COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
|
"""COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
|
||||||
]:
|
]:
|
||||||
echo dataload
|
echo dataload
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue