urbina1975¶

This dataset comes from DreamBank. It merges the Peruvian men (`peru-m`) and Peruvian women (`peru-f`) datasets from an archived copy of DreamBank hosted on Zenodo. See the krank GitHub Issue and the urbina1975 docs page for more info.

Setup¶

In [1]:

Copied!

from datetime import datetime, timezone
import os

import pandas as pd
import pooch
from datetime import datetime, timezone
import os

import pandas as pd
import pooch

In [2]:

Copied!

# DreamBank dataset identifiers whose dreams are combined into urbina1975.
DATASETS = [
    "peru-f",
    "peru-m",
]

Load¶

In [3]:

Copied!





# Retrieve the latest datasets from GitHub releases
# OR from local file if during development of a new release.
DREAMBANK_VERSION = "v1.0.0-alpha5"

# Both files are assets of the same release, so the URL prefix is built once.
_RELEASE_URL = f"https://github.com/remrama/dreambank/releases/download/{DREAMBANK_VERSION}"

# Each file is pinned to an MD5 hash via `known_hash`.
datasets_kwargs = {
    "url": f"{_RELEASE_URL}/datasets.csv.xz",
    "known_hash": "md5:1475582e2daa1da53920df50cb9fc98e",
}
dreams_kwargs = {
    "url": f"{_RELEASE_URL}/dreams.csv.xz",
    "known_hash": "md5:2dcab92f9d9515df174388babb5c9e5a",
    "progressbar": True,
}

# The returned values are what gets handed to `pd.read_csv` in the next cell.
datasets_fname = pooch.retrieve(**datasets_kwargs)
dreams_fname = pooch.retrieve(**dreams_kwargs)

# Development alternative: point at locally built files instead of the release.
# datasets_fname = "../output/datasets.csv.xz"
# dreams_fname = "../output/dreams.csv.xz"

In [4]:

Copied!

# Parse both retrieved release files into DataFrames.
# (`datasets` is loaded here but not referenced again in the cells shown below.)
datasets, dreams = (
    pd.read_csv(fname) for fname in (datasets_fname, dreams_fname)
)

Process¶

In [5]:

Copied!

# Keep only the rows whose `dataset` is one of DATASETS. The explicit copy
# gives later column assignments an independent frame to write into.
in_selected_datasets = dreams["dataset"].isin(DATASETS)
dreams = dreams.loc[in_selected_datasets].copy()

In [6]:

Copied!

# Sanity checks on the raw subset before deriving any columns.

# The `metadata` column must be entirely empty for these datasets.
assert dreams["metadata"].isna().all()

# Every dream ID must look like "F01-1": F or M, a two-digit number 00-49,
# a hyphen, then a single digit 1-8.
dream_ids = dreams["dream_id"]
assert dream_ids.str.match(r"^[FM][0-4][0-9]-[1-8]$").all()

# The last letter of the dataset name ("f"/"m") must agree with the
# leading letter of the dream ID.
dataset_suffix = dreams["dataset"].str[-1]
id_prefix = dream_ids.str[0].str.lower()
assert dataset_suffix.eq(id_prefix).all()

In [7]:

Copied!

# Derive the two author-level columns:
#   sex    - from the trailing letter of the dataset name ("f" / "m")
#   author - the first three characters of the dream ID (e.g. "F01")
dreams = dreams.assign(
    sex=dreams["dataset"].str[-1].map({"f": "female", "m": "male"}),
    author=dreams["dream_id"].str.slice(stop=3),
)

In [8]:

Copied!

# Rename the text column, keep only the exported columns (in export order),
# and order rows by author. Done once as a single chain.
# NOTE(review): `sort_values` defaults to a sort kind that is not guaranteed
# stable, so the relative order of one author's dreams is not guaranteed to
# be preserved. Consider kind="stable" if that order matters, and confirm
# the exported file hash is unaffected before changing it.
dreams = (
    dreams
    .rename(columns={"dream_text": "report"})
    .reindex(columns=["author", "sex", "report"])
    .sort_values("author")
)

In [9]:

Copied!

# Minor fix: See https://github.com/remrama/dreambank/issues/6
dreams["report"] = dreams["report"].str.replace(r"\\$", "", regex=True)
# Minor fix: See https://github.com/remrama/dreambank/issues/6
dreams["report"] = dreams["report"].str.replace(r"\\$", "", regex=True)

In [10]:

Copied!

# Preview the first five processed rows.
dreams.head(n=5)

Out[10]:

	author	sex	report
35127	F01	female	I dreamed I was a the beach with my sister, su...
35128	F01	female	I dreamed I was married to a very tall and fat...
35129	F01	female	I dreamed about an unknown person who had a sn...
35130	F01	female	I dreamed about a friend and we were going by ...
35131	F01	female	I dreamed I was at a beach and then a group of...

In [11]:

Copied!

# Summary statistics (count / unique / top / freq) for each column.
dreams.describe()

Out[11]:

	author	sex	report
count	766	766	766
unique	96	2	766
top	F01	male	I dreamed I was a the beach with my sister, su...
freq	8	384	1

In [12]:

Copied!

# Column dtypes, non-null counts, and memory footprint.
# memory_usage="deep" measures the actual string contents rather than estimating.
dreams.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 766 entries, 35127 to 35892
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   author  766 non-null    object
 1   sex     766 non-null    object
 2   report  766 non-null    object
dtypes: object(3)
memory usage: 603.5 KB

Export¶

In [13]:

Copied!





# Final integrity checks on the frame that is about to be written.

# No missing values in any column.
assert not dreams.isna().any(axis=None)

# No fully duplicated rows, and no report text appearing more than once.
assert not dreams.duplicated().any()
assert not dreams.duplicated(subset="report").any()

# `sex` only takes the two expected labels.
assert dreams["sex"].isin(["female", "male"]).all()

# Reports carry no leading or trailing whitespace.
reports = dreams["report"]
assert reports.eq(reports.str.strip()).all()

In [14]:

Copied!





# Write the processed dreams to CSV and print provenance details for the file.
# The file is written exactly once: with mode="x", a second `to_csv` call on
# the same path raises FileExistsError.
outpath = "./output/urbina1975.csv"
os.makedirs(os.path.dirname(outpath), exist_ok=True)

TO_CSV_KWARGS = {
    "index": False,
    "na_rep": "N/A",
    "sep": ",",
    "mode": "x",  # Switch to `w` to overwrite existing file
    "encoding": "utf-8-sig",  # Include sig/BOM for better compatibility with Excel
    "lineterminator": "\n",
    "quoting": 2,  # 2 = csv.QUOTE_NONNUMERIC
    "quotechar": '"',
    "doublequote": True,
}
dreams.to_csv(outpath, **TO_CSV_KWARGS)

# Report name, size, checksums, and modification time (UTC) of the written file.
print(f"file: {os.path.basename(outpath)}")
print(f"size: {os.path.getsize(outpath) / 1e6} MB")
print(f"md5: {pooch.file_hash(outpath, alg='md5')}")
print(f"sha256: {pooch.file_hash(outpath, alg='sha256')}")
print(f"timestamp: {datetime.fromtimestamp(os.path.getmtime(outpath), tz=timezone.utc).isoformat(timespec='seconds')}")

file: urbina1975.csv
size: 0.506346 MB
md5: 05197362b787f8bef2e7c11fb9b81bf1
sha256: 4367e9c0b09176b920cb241c1658323eae68a7f51b5df8e4dce2fecffc343b3a
timestamp: 2025-12-30T04:11:42+00:00