Parse the "known for" column similarly to the "principals".

This likely generates a lot of duplicates, but it catches some actors that
don't appear to be in the principals export for some reason.  The duplicates
are eliminated when querying.

FossilOrigin-Name: b19ec78785cc56da5551764880ee9983c2a989c04d60a2b7c402c6d13776361d
This commit is contained in:
Mahlon E. Smith 2025-04-10 03:33:42 +00:00
parent 29469dc9d4
commit cf00df149d

View file

@ -24,6 +24,9 @@ const DB = "imdb"
const SOURCE = "https://datasets.imdbws.com" const SOURCE = "https://datasets.imdbws.com"
const FILES = @[ "name.basics", "title.basics", "title.principals" ] const FILES = @[ "name.basics", "title.basics", "title.principals" ]
# Accumulator for "known for" pairs: each entry holds an actor id (aid) and
# the list of title ids (mids) attached to it.  Entries are appended while the
# name.basics file is parsed and written out to a CSV afterwards.
var knownFor: seq[ tuple[ aid: string, mids: seq[string] ] ] = @[]
# #
# Prep everything! # Prep everything!
# #
@ -62,8 +65,14 @@ for file in FILES:
# nconst primaryName birthYear deathYear primaryProfession knownForTitles # nconst primaryName birthYear deathYear primaryProfession knownForTitles
of "name.basics": of "name.basics":
var id = $row[0].replace( "nm" ).parseInt()
var known = row[5].split( ',' )
known.applyIt(
$it.replace( "tt" ).parseInt()
)
knownFor.add( (aid: id, mids: known ) )
row[0] = id
row = row[0..3] row = row[0..3]
row[0] = $row[0].replace( "nm" ).parseInt()
# tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres # tconst titleType primaryTitle originalTitle isAdult startYear endYear runtimeMinutes genres
of "title.basics": of "title.basics":
@ -108,6 +117,13 @@ for file in FILES:
csv_file.close() csv_file.close()
stderr.write( "\n" ) stderr.write( "\n" )
#
# Dump the collected "known for" pairs as a two column CSV (aid,mid), one
# line per actor/title pair.  This file is loaded later as additional
# ActedIn rows.  Duplicates are not filtered here.
#
let known_file = open( "known.principals.csv", fmWrite )
try:
    known_file.write( "aid,mid\n" )
    for known in knownFor:
        for mid in known.mids:
            known_file.write &"{known.aid},{mid}\n"
finally:
    # Always release the handle, even if a write fails part way through.
    known_file.close()
# #
# Ok, now import into a fresh kuzu database. # Ok, now import into a fresh kuzu database.
@ -132,7 +148,8 @@ if not DB.dirExists:
for dataload in @[ for dataload in @[
"""COPY Actor FROM "./name.basics.csv" (header=true)""", """COPY Actor FROM "./name.basics.csv" (header=true)""",
"""COPY Movie FROM "./title.basics.csv" (header=true)""", """COPY Movie FROM "./title.basics.csv" (header=true)""",
"""COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)""" """COPY ActedIn FROM "./title.principals.csv" (header=true, ignore_errors=true)""",
"""COPY ActedIn FROM "./known.principals.csv" (header=true, ignore_errors=true)"""
]: ]:
echo dataload echo dataload
var q = conn.query( dataload ) var q = conn.query( dataload )