Parse the "known for" column similarly to the "principals".
This likely generates a lot of duplicates, but it catches some actors that, for some reason, don't appear in the principals export. The duplicates are eliminated at query time. FossilOrigin-Name: b19ec78785cc56da5551764880ee9983c2a989c04d60a2b7c402c6d13776361d
This commit is contained in:
parent
29469dc9d4
commit
cf00df149d
1 changed files with 19 additions and 2 deletions
|
|
@ -24,6 +24,9 @@ const DB = "imdb"
|
||||||
const SOURCE = "https://datasets.imdbws.com"
|
const SOURCE = "https://datasets.imdbws.com"
|
||||||
const FILES = @[ "name.basics", "title.basics", "title.principals" ]
|
const FILES = @[ "name.basics", "title.basics", "title.principals" ]
|
||||||
|
|
||||||
|
var knownFor: seq[ tuple[ aid: string, mids: seq[string] ] ] = @[]
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Prep everything!
|
# Prep everything!
|
||||||
#
|
#
|
||||||
|
|
@ -62,8 +65,14 @@ for file in FILES:
|
||||||
|
|
||||||
# nconst primaryName birthYear deathYear primaryProfession knownForTitles
|
# nconst primaryName birthYear deathYear primaryProfession knownForTitles
|
||||||
of "name.basics":
|
of "name.basics":
|
||||||
|
var id = $row[0].replace( "nm" ).parseInt()
|
||||||
|
var known = row[5].split( ',' )
|
||||||
|
known.applyIt(
|
||||||
|
$it.replace( "tt" ).parseInt()
|
||||||
|
)
|
||||||
|
knownFor.add( (aid: id, mids: known ) )
|
||||||
|
row[0] = id
|
||||||
row = row[0..3]
|
row = row[0..3]
|
||||||
row[0] = $row[0].replace( "nm" ).parseInt()
|
|
||||||
|
|
||||||
# tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
|
# tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
|
||||||
of "title.basics":
|
of "title.basics":
|
||||||
|
|
@ -108,6 +117,13 @@ for file in FILES:
|
||||||
csv_file.close()
|
csv_file.close()
|
||||||
stderr.write( "\n" )
|
stderr.write( "\n" )
|
||||||
|
|
||||||
|
let known_file = open( "known.principals.csv", fmWrite )
|
||||||
|
known_file.write( "aid,mid\n" )
|
||||||
|
for known in knownFor:
|
||||||
|
for mid in known.mids:
|
||||||
|
known_file.write &"{known.aid},{mid}\n"
|
||||||
|
known_file.close()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Ok, now import into a fresh kuzu database.
|
# Ok, now import into a fresh kuzu database.
|
||||||
|
|
@ -132,7 +148,8 @@ if not DB.dirExists:
|
||||||
for dataload in @[
|
for dataload in @[
|
||||||
"""COPY Actor FROM "./name.basics.csv" (header=true)""",
|
"""COPY Actor FROM "./name.basics.csv" (header=true)""",
|
||||||
"""COPY Movie FROM "./title.basics.csv" (header=true)""",
|
"""COPY Movie FROM "./title.basics.csv" (header=true)""",
|
||||||
"""COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)"""
|
"""COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)""",
|
||||||
|
"""COPY ActedIn FROM "./known.principals.csv" (header=true, ignore_errors=true)"""
|
||||||
]:
|
]:
|
||||||
echo dataload
|
echo dataload
|
||||||
var q = conn.query( dataload )
|
var q = conn.query( dataload )
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue