ucsc1996¶

This is the UCSC women, 1996 dataset from DreamBank. It is sourced from an archived copy of DreamBank hosted on Zenodo. See the krank GitHub Issue and the ucsc1996 docs page for more info.

Setup¶

In [ ]:

Copied!

import os
from datetime import datetime, timezone

import pandas as pd
import pooch
import os
from datetime import datetime, timezone

import pandas as pd
import pooch

In [2]:

Copied!

# DreamBank `dataset` identifiers to keep when filtering the full table.
DATASETS = ["ucsc_women"]

Load¶

In [3]:

Copied!





# Fetch the pinned DreamBank release asset from GitHub releases; the download
# is checked against `known_hash`.
# While developing a new release, use the local file instead (commented-out
# line at the bottom of this cell).
DREAMBANK_VERSION = "v1.0.0-alpha5"
dreams_kwargs = dict(
    url=f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}/dreams.csv.xz",
    known_hash="md5:2dcab92f9d9515df174388babb5c9e5a",
    progressbar=True,
)

dreams_fname = pooch.retrieve(**dreams_kwargs)
# dreams_fname = "../output/dreams.csv.xz"

In [4]:

Copied!

# Read the identifier columns explicitly as strings. Without this, pandas
# infers dtypes per parser chunk, so purely numeric values (e.g. a dream_id of
# "16") can come back as integers in some chunks and break the `.str`
# operations applied to these columns later in the notebook.
dreams = pd.read_csv(dreams_fname, dtype={"dream_id": str, "metadata": str})

Process¶

In [5]:

Copied!

# Keep only the rows whose `dataset` is listed in DATASETS. The copy makes the
# result an independent frame, so later column assignments do not act on a
# slice of the full table.
dreams = dreams.loc[lambda df: df["dataset"].isin(DATASETS)].copy()

In [6]:

Copied!





# Verify assumptions about the data.
# Metadata must be "<age>/female" with age 18-24. The alternation is used
# because a character-class pattern such as [12][8901234] would also accept
# 10-14 and 28-29.
assert dreams["metadata"].str.match(r"^(?:1[89]|2[0-4])/female$").all()
# Dream IDs are a 1-2 digit person number with an optional a/b suffix.
assert dreams["dream_id"].str.match(r"^[1-9][0-9]?[a-b]?$").all()

# Verify that the age and sex metadata are consistent for the a/b person:
# dropping the a/b suffix gives the person, and each person must map to
# exactly one metadata value.
assert (
    dreams.groupby(dreams["dream_id"].str.rstrip("ab"))["metadata"]
    .nunique()
    .eq(1)
    .all()
)

# Display the suffixed rows for visual inspection. Matching on the suffix
# (rather than on ID length) also catches single-digit IDs such as "5a".
dreams[dreams["dream_id"].str.contains(r"[a-b]$")]

Out[6]:

	dataset	dream_id	metadata	word_count	dream_text
37182	ucsc_women	16a	20/female	124	In part of the dream I was arguing with my now...
37183	ucsc_women	16b	20/female	60	I dreamt that my ex-boyfriend forgave me for b...

In [7]:

Copied!





# Derive the demographic columns in a single step:
# - author: digits of dream_id with the optional a/b suffix dropped (kept as a string)
# - age: the number before the slash in metadata, as an integer
# - sex: constant column
dreams = dreams.assign(
    author=dreams["dream_id"].str.extract(r"^([1-9][0-9]?)[a-b]?$", expand=False).astype(str),
    age=dreams["metadata"].str.extract(r"^([12][8901234])/female$", expand=False).astype(int),
    sex="female",
)

In [8]:

Copied!

# Rename the text column, keep only the exported columns (in this order), and
# order the rows by author. `author` holds strings, so this sort is
# lexicographic ("10" sorts before "2").
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "age", "sex", "report"])
    .sort_values("author")
)

In [9]:

Copied!

# Preview the first five rows of the processed table.
dreams.head(n=5)

Out[9]:

	author	age	sex	report
37168	1	18	female	I had this dream at the beginning of the schoo...
37177	10	19	female	It was my first day of class. It was a small c...
37178	12	19	female	I am at a place with buildings surroundings, o...
37179	13	19	female	I was watching TV with my boyfriend. I remembe...
37180	14	19	female	I was in a concert hall of my favorite singer,...

In [10]:

Copied!

# Summary statistics for the numeric columns only.
dreams.describe(include=["number"])

Out[10]:

	age
count	81.000000
mean	20.617284
std	1.280311
min	18.000000
25%	20.000000
50%	21.000000
75%	21.000000
max	24.000000

In [11]:

Copied!

# Summary (count / unique / top / freq) for the non-numeric columns.
dreams.describe(exclude=["number"])

Out[11]:

	author	sex	report
count	81	81	81
unique	80	1	81
top	16	female	I had this dream at the beginning of the schoo...
freq	2	81	1

In [12]:

Copied!

# Column dtypes, non-null counts, and memory footprint of the final table.
dreams.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 37168 to 37176
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  81 non-null     object
 1   age     81 non-null     int64 
 2   sex     81 non-null     object
 3   report  81 non-null     object
dtypes: int64(1), object(3)
memory usage: 67.3 KB

Export¶

In [13]:

Copied!





# Final sanity checks on the table that is about to be exported.
assert dreams.notna().all(axis=None)  # no missing values in any column
assert not dreams.duplicated().any()  # no fully duplicated rows
assert dreams["report"].is_unique  # every report text is distinct
assert (dreams["sex"] == "female").all()
assert dreams["age"].ge(18).all() and dreams["age"].le(24).all()
assert dreams["report"].eq(dreams["report"].str.strip()).all()  # no leading/trailing whitespace

In [14]:

Copied!





# Write the processed table to CSV, then print the file's name, size, hashes,
# and modification time (UTC).
outpath = "./output/ucsc1996.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)

TO_CSV_KWARGS = {
    "index": False,
    "na_rep": "N/A",
    "sep": ",",
    "mode": "x",  # Switch to `w` to overwrite existing file
    "encoding": "utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    "lineterminator": "\n",
    "quoting": 2,  # 2 = csv.QUOTE_NONNUMERIC
    "quotechar": '"',
    "doublequote": True,
}
# The write must happen exactly once per run: with mode "x", a second write to
# the same path raises FileExistsError because the file already exists.
dreams.to_csv(outpath, **TO_CSV_KWARGS)

print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")

file: ucsc1996.csv
size: 0.056766 MB
md5: caf94dc6bac916330caeb200e60c5671
sha256: cc10c7e80e99039928bfe29bbb825dd1b3737cc994dcf9eba4650e1b1daa8c15
timestamp: 2025-12-30T18:35:09+00:00