Guides

Use-case recipes

Step-by-step recipes for common collection and discovery tasks, each with a runnable Python script (only the requests library is required). These build on the core workflows; for cross-cutting rules (pagination, dates, PascalCase responses) see the API conventions.

Shared setup

Every recipe assumes these helpers and your trial credentials. Two rules to keep in mind: discovering live sources (mailboxes, OneDrive accounts) uses the /users endpoint on the source location, while browsing collected data uses the /items endpoint on the BLOB collection location.

import requests, time

# Connection settings shared by every recipe; replace the placeholders with your own values.
BASE = "https://datatap-dev-api.azurewebsites.net"   # your environment's API base URL
TENANT_ID = "<your-tenant-id>"                        # interpolated into every /api/{TENANT_ID}/... path
API_KEY   = "<your-tenant-api-key>"                  # sent as the X-API-Key header by get() and post()

def get(path, params=None):
    """Issue an authenticated GET against the API and return the decoded body.

    ``path`` is appended to ``BASE``; ``params`` (optional) becomes the query
    string. The request carries the ``X-API-Key`` header and a 60-second
    timeout. An HTTP error status raises via ``raise_for_status()``.

    Returns the parsed JSON payload, or ``None`` when the response body is empty.
    """
    url = f"{BASE}{path}"
    auth_headers = {"X-API-Key": API_KEY}
    response = requests.get(url, headers=auth_headers, params=params, timeout=60)
    response.raise_for_status()
    if not response.content:
        return None
    return response.json()

def post(path, json=None):
    """Issue an authenticated POST against the API and return the decoded body.

    ``path`` is appended to ``BASE``; ``json`` (optional) is sent as the JSON
    request body. The request carries the ``X-API-Key`` header and a 60-second
    timeout. An HTTP error status raises via ``raise_for_status()``.

    Returns the parsed JSON payload, or ``None`` when the response body is empty.
    """
    url = f"{BASE}{path}"
    auth_headers = {"X-API-Key": API_KEY}
    response = requests.post(url, headers=auth_headers, json=json, timeout=60)
    response.raise_for_status()
    if not response.content:
        return None
    return response.json()

# Fetch the tenant's locations a single time, then pick out the three the recipes use: the
# Exchange and OneDrive sources and the BLOB "Backup" destination.
LOCATIONS = get(f"/api/{TENANT_ID}/locations")


def _first_location_id(kind):
    """Return the Id of the first entry in LOCATIONS whose Type equals ``kind``."""
    for loc in LOCATIONS:
        if loc["Type"] == kind:
            return loc["Id"]
    # Same exception a bare next() raises when no location of that type exists.
    raise StopIteration


EXCHANGE_LOC = _first_location_id("EXCHANGE")
ONEDRIVE_LOC = _first_location_id("ONEDRIVE")
BLOB_LOC = _first_location_id("BLOB")

1 · Discover all the mailboxes

List every user and shared mailbox in your Exchange source, following the Options.Marker continuation token until it is empty.

# The discovery /users endpoint on a SOURCE location reads live Microsoft Graph. Sending
# filter.type=USERS_AND_SHAREDMAILBOXES adds shared/resource mailboxes; without it only user
# mailboxes are returned.
def discover_sources(location_id, include_shared=True):
    """Return every source listed by ``/discovery/{location_id}/users``.

    Pages are requested 100 at a time and the ``Options.Marker`` continuation
    token from each response is sent back as ``Marker`` until a response comes
    back without one. When ``include_shared`` is true the
    ``filter.type=USERS_AND_SHAREDMAILBOXES`` query parameter is added.

    Returns the concatenated ``Sources`` lists (each entry: Id, Name, Email, Type).
    """
    endpoint = f"/api/{TENANT_ID}/discovery/{location_id}/users"
    base_query = {"PageSize": 100}
    if include_shared:
        base_query["filter.type"] = "USERS_AND_SHAREDMAILBOXES"

    found = []
    token = None
    has_more = True
    while has_more:
        query = dict(base_query)
        if token:
            query["Marker"] = token
        result = get(endpoint, params=query)
        found += result["Sources"]
        token = result["Options"].get("Marker")
        has_more = bool(token)
    return found

# Discover every mailbox (shared ones included) and print one line per mailbox.
mailboxes = discover_sources(EXCHANGE_LOC)
print(f"{len(mailboxes)} mailboxes")
for box in mailboxes:
    kind_label = f"({box['Type']})"
    print(box["Email"], "—", box["Name"], kind_label)

2 · Discover all the OneDrive accounts

The same paginated discovery call against the OneDrive source location returns every drive owner.

# Reuse the paginated /users discovery call, this time on the OneDrive SOURCE location, which
# returns the drive owners (users with a provisioned OneDrive). The shared-mailbox filter does
# not apply here, so it is switched off.
accounts = discover_sources(ONEDRIVE_LOC, include_shared=False)
print(f"{len(accounts)} OneDrive accounts")
for account in accounts:
    email, display_name = account["Email"], account["Name"]
    print(email, "—", display_name)

3 · Collect emails & notes from two user accounts

Create a BACKUP job whose policy includes only the two chosen mailboxes. An Exchange collection captures their mail and notes (and calendar items); to scope it further, add folder includes (IndexFromPaths) so that only specific folders are collected. Indexing for search is a premium feature, so the job runs with indexFiles: false.

# Restrict the collection to two mailboxes by listing them in the policy's Users include-list.
# An Exchange collection gathers each user's mail (all folders) and notes, and also captures
# calendar items; add IndexFromPaths includes to limit it to specific folders.
mboxes = {}
for m in discover_sources(EXCHANGE_LOC):
    mboxes[m["Email"].lower()] = m                     # keyed by lower-cased address
picked = [mboxes[address] for address in ("alice@contoso.com", "bob@contoso.com")]

# One INCLUDE entry per chosen mailbox.
included_users = []
for u in picked:
    included_users.append({
        "item": {"id": u["Id"], "name": u["Name"], "email": u["Email"], "type": u["Type"]},
        "type": "INCLUDE",
    })

job_request = {
    "name": "Emails & notes — Alice + Bob",
    "description": "Collection of two mailboxes",
    "priority": "Medium",
    "type": "BACKUP",
    "schedule": {"type": "NOW"},
    "sourceId": EXCHANGE_LOC,
    "destinationId": BLOB_LOC,
    "indexFiles": False,                               # indexing is a premium (paid) feature
    "owner": "me@contoso.com",
    "policy": {
        "$type": "exchange-backup",                   # polymorphic discriminator
        "users": included_users,
    },
}
job = post(f"/api/{TENANT_ID}/jobs", json=job_request)
JOB_ID = job["Id"]

task = post(f"/api/{TENANT_ID}/jobs/{JOB_ID}/start")
TASK_ID = task["TaskId"]

# Poll every 10 seconds until the task reaches a final state:
# STARTING -> RUNNING -> READY | FAILED.
status_path = f"/api/{TENANT_ID}/jobs/{JOB_ID}/status/{TASK_ID}"
finished = False
while not finished:
    st = get(status_path)
    print("state:", st["State"])
    finished = st["State"] in ("READY", "FAILED")
    if not finished:
        time.sleep(10)

4 · List collected Sent Items for a date range

Browse the collected mailbox on the BLOB location: find its Sent Items folder, then page its messages between two timestamps. The mailbox must have been collected first (recipe 3). Each result carries the ItemId + FragmentId you need to download a single message as .eml.

# "Collected" data is browsed on the BLOB collection location (not the live source). First list the
# mailbox's folders to find Sent Items, then list its MessageItems within the date range.
SOURCE = "alice@contoso.com"                           # the collected mailbox (its owner id/email)


def _list_mailbox_folders(source):
    """Return every top-level MailboxFolder of the collected mailbox ``source``.

    Follows the ``Options.Marker`` continuation token like the other listings
    in these recipes, so a mailbox with more than one page of folders is not
    silently truncated to the first 100.
    """
    found, marker = [], None
    while True:
        params = {"types": "MailboxFolder", "PageSize": 100}
        if marker:
            params["Marker"] = marker
        page = get(f"/api/{TENANT_ID}/discovery/{BLOB_LOC}/source/{source}/items/-",
                   params=params)
        found.extend(page["Items"])
        marker = page["Options"].get("Marker")
        if not marker:
            return found


folders = _list_mailbox_folders(SOURCE)

# Match on the folder discriminator first, falling back to the display name.
sent = next((f for f in folders
             if f.get("Discriminator") == "SentMailboxFolder" or f["Name"] == "Sent Items"),
            None)
if sent is None:
    # A bare next() would raise a message-less StopIteration here; say what went wrong instead.
    raise LookupError(
        f"No Sent Items folder found for collected mailbox {SOURCE!r}; "
        "make sure the mailbox has been collected first (recipe 3)."
    )

# MessageItems can only be listed with a full-timestamp date range. Every returned item carries
# an ItemId + FragmentId, which the /download endpoint takes to fetch one message as .eml.
def list_messages(folder_id, from_date, to_date):
    """Return all MessageItems in ``folder_id`` between ``from_date`` and ``to_date``.

    Reads the collected mailbox ``SOURCE`` on the BLOB location, 100 items per
    page, resending each response's ``Options.Marker`` as ``Marker`` until a
    response comes back without one.
    """
    endpoint = f"/api/{TENANT_ID}/discovery/{BLOB_LOC}/source/{SOURCE}/items/{folder_id}"
    base_query = {
        "types": "MessageItem",
        "fromDate": from_date,
        "toDate": to_date,
        "PageSize": 100,
    }

    messages = []
    token = None
    has_more = True
    while has_more:
        query = dict(base_query)
        if token:
            query["Marker"] = token
        result = get(endpoint, params=query)
        messages += result["Items"]
        token = result["Options"].get("Marker")
        has_more = bool(token)
    return messages

# List the Sent Items messages for Q1 2024 and print one line per message.
range_start = "2024-01-01T00:00:00Z"
range_end = "2024-03-31T23:59:59Z"
sent_items = list_messages(sent["ItemId"], range_start, range_end)
print(f"{len(sent_items)} sent items in range")
for message in sent_items:
    print(message["CreatedDate"], message["Name"], "fragment:", message["FragmentId"])

5 · Collect all files from a user OneDrive

Collect one user's entire drive with a OneDrive collection scoped to that drive owner.

# Collect one user's whole OneDrive: a OneDrive collection whose policy includes only that
# drive owner. The owner record comes from discovery on the OneDrive source location.
drive_owners = discover_sources(ONEDRIVE_LOC, include_shared=False)
alice_matches = (acct for acct in drive_owners
                 if acct["Email"].lower() == "alice@contoso.com")
owner = next(alice_matches)                            # first match; StopIteration if absent

owner_item = {
    "id": owner["Id"],
    "name": owner["Name"],
    "email": owner["Email"],
    "type": owner["Type"],
}
job_request = {
    "name": "OneDrive collection — Alice",
    "description": "Full drive collection",
    "priority": "Medium",
    "type": "BACKUP",
    "schedule": {"type": "NOW"},
    "sourceId": ONEDRIVE_LOC,
    "destinationId": BLOB_LOC,
    "indexFiles": False,
    "owner": "me@contoso.com",
    "policy": {
        "$type": "onedrive-backup",
        "users": [{"item": owner_item, "type": "INCLUDE"}],
    },
}
job = post(f"/api/{TENANT_ID}/jobs", json=job_request)

# Start the job right away; this recipe does not wait for it to finish.
start_path = f"/api/{TENANT_ID}/jobs/{job['Id']}/start"
post(start_path)

6 · List & download the PDF files from a user drive

Walk the collected drive and keep the .pdf files (filter on the file name — there is no server-side extension filter). The item /download endpoint serves emails only, so to retrieve the actual file bytes run a RESTORE job scoped to PDFs, which writes them back into the user's OneDrive.

# 1 · LIST — browse the collected drive on the BLOB location and keep only the .pdf files.
#     Browsing returns OneDriveFile / OneDriveFolder nodes; descend into a folder by passing its
#     ItemId as the node id. There is no server-side extension filter, so the match is done
#     client-side on each file's Name.
SOURCE = "alice@contoso.com"                           # collected drive owner; read as a global by list_children

def list_children(node_id, types):
    """Return all children of ``node_id`` whose item type matches ``types``.

    Browses the collected data of ``SOURCE`` on the BLOB location, 100 items
    per page, resending each response's ``Options.Marker`` as ``Marker`` until
    a response comes back without one.
    """
    endpoint = f"/api/{TENANT_ID}/discovery/{BLOB_LOC}/source/{SOURCE}/items/{node_id}"
    base_query = {"types": types, "PageSize": 100}

    children = []
    token = None
    has_more = True
    while has_more:
        query = dict(base_query)
        if token:
            query["Marker"] = token
        result = get(endpoint, params=query)
        children += result["Items"]
        token = result["Options"].get("Marker")
        has_more = bool(token)
    return children

def walk_pdfs(node_id="-"):
    """Return every .pdf file at or below ``node_id`` in the collected drive.

    Files directly under the node come first, followed by the results of each
    subfolder in listing order (depth-first). The default node id ``"-"``
    starts the walk at the top of the drive. The extension match is
    case-insensitive and done on the file's Name.
    """
    matches = []
    for entry in list_children(node_id, "OneDriveFile"):
        if entry["Name"].lower().endswith(".pdf"):
            matches.append(entry)
    for subfolder in list_children(node_id, "OneDriveFolder"):
        matches.extend(walk_pdfs(subfolder["ItemId"]))
    return matches

# Walk the whole drive from the root and report the readable path of each PDF found.
pdfs = walk_pdfs()
pdf_paths = [entry["ReadablePath"] for entry in pdfs]
print(f"{len(pdfs)} PDFs:", pdf_paths)

# 2 · DOWNLOAD — /download currently serves emails only (dataType=email), so the PDF bytes are
#     retrieved with a RESTORE job limited to PDFs, which writes the files into the user's drive.
restore_user = {
    "item": {"id": "<alice-id>", "name": "Alice",
             "email": "alice@contoso.com", "type": "OneDriveUser"},
    "type": "INCLUDE",
}
restore_policy = {
    "$type": "onedrive-restore",
    "users": [restore_user],
    "extensions": [{"item": "PDF", "type": "INCLUDE"}],   # only .pdf files
}
restore_request = {
    "name": "Restore PDFs — Alice",
    "description": "Restore .pdf files to OneDrive",
    "priority": "High",
    "type": "RESTORE",
    "schedule": {"type": "NOW"},
    "sourceId": BLOB_LOC,                              # restore FROM the collection
    "destinationId": ONEDRIVE_LOC,                     # …back INTO OneDrive
    "indexFiles": False,
    "owner": "me@contoso.com",
    "policy": restore_policy,
}
restore = post(f"/api/{TENANT_ID}/jobs", json=restore_request)

# Start the restore immediately; this recipe does not poll for completion.
start_path = f"/api/{TENANT_ID}/jobs/{restore['Id']}/start"
post(start_path)