# Source code for qiskit_machine_learning.datasets.ad_hoc

# This code is part of a Qiskit project.
#
# (C) Copyright IBM 2018, 2025.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""
Ad Hoc Dataset
"""
from __future__ import annotations

import warnings
import itertools as it

import numpy as np
from scipy.stats.qmc import Sobol
from qiskit.utils import optionals

from ..utils import algorithm_globals


# pylint: disable=too-many-positional-arguments
def ad_hoc_data(
    training_size: int,
    test_size: int,
    n: int,
    gap: float = 0,
    plot_data: bool = False,
    one_hot: bool = True,
    include_sample_total: bool = False,
    entanglement: str = "full",
    sampling_method: str = "grid",
    divisions: int = 0,
    labelling_method: str = "expectation",
    class_labels: list | None = None,
) -> (
    tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]
    | tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]
):
    r"""Generate a dataset that can be fully separated by
    :class:`~qiskit.circuit.library.ZZFeatureMap`, following the procedure in [1].

    First, vectors :math:`\vec{x} \in (0, 2\pi]^{n}` are drawn uniformly using the
    strategy selected by ``sampling_method``. Next, the feature map is applied:

    .. math::

        |\Phi(\vec{x})\rangle = U_{\Phi(\vec{x})} \, H^{\otimes n} \,
        U_{\Phi(\vec{x})} \, H^{\otimes n} \, |0^{\otimes n}\rangle

    where

    .. math::

        U_{\Phi(\vec{x})} = \exp\Bigl(i \sum_{S \subseteq [n]}
        \phi_S(\vec{x}) \prod_{i \in S} Z_i\Bigr),

    and

    .. math::

        \begin{cases}
        \phi_{\{i, j\}} = (\pi - x_i)(\pi - x_j) \\
        \phi_{\{i\}} = x_i
        \end{cases}

    Which second-order terms :math:`Z_i Z_j` enter the sum is controlled by the
    ``entanglement`` argument (``"linear"``, ``"circular"``, or ``"full"``).

    An observable is then defined as

    .. math::

        O = V^\dagger \bigl(\prod_i Z_i\bigr) V

    where :math:`V` is a randomly generated unitary. With
    ``labelling_method="expectation"``, the expectation value
    :math:`\langle \Phi(\vec{x})| O |\Phi(\vec{x})\rangle` is compared against the
    gap :math:`\Delta` (``gap``) to assign :math:`\pm 1` labels; with
    ``"measurement"``, a simulated measurement of :math:`O` assigns the label.

    **References:**

    [1] Havlíček V, Córcoles AD, Temme K, Harrow AW, Kandala A, Chow JM,
    Gambetta JM. *Supervised learning with quantum-enhanced feature spaces*.
    Nature. 2019 Mar;567(7747):209-212.
    `arXiv:1804.11326 <https://arxiv.org/abs/1804.11326>`_

    Args:
        training_size: Number of training samples per class.
        test_size: Number of testing samples per class.
        n: Number of qubits (dimension of the feature space).
        gap: Separation gap :math:`\Delta`, used only when
            ``labelling_method="expectation"``. Default is 0.
        plot_data: If True, plot the sampled data (automatically disabled with a
            warning when ``n > 3``). Default is False.
        one_hot: If True, return labels in one-hot format. Default is True.
        include_sample_total: If True, also return the total number of accepted
            samples. Default is False.
        entanglement: Which second-order terms :math:`Z_i Z_j` appear in
            :math:`U_{\Phi(\vec{x})}`:

            * ``"linear"``: terms :math:`Z_i Z_{i+1}`.
            * ``"circular"``: ``"linear"`` terms plus :math:`Z_{n-1}Z_0`.
            * ``"full"``: all pairwise terms :math:`Z_i Z_j`.

            Default is ``"full"``.
        sampling_method: Strategy for drawing uniform samples :math:`\vec{x}`:

            * ``"grid"``: points from a uniform grid (only for ``n <= 3``).
            * ``"hypercube"``: a variant of Latin Hypercube stratified sampling.
            * ``"sobol"``: Sobol sequences.

            Default is ``"grid"``.
        divisions: Required when ``sampling_method="hypercube"``; the number of
            stratifications per dimension. Recommended to be close to
            ``training_size``.
        labelling_method: How labels are assigned:

            * ``"expectation"``: via the expectation value of the observable.
            * ``"measurement"``: via a simulated measurement.

            Default is ``"expectation"``.
        class_labels: Custom labels for the two classes when one-hot encoding is
            not enabled. If not provided, the labels default to ``0`` and ``1``.

    Returns:
        Tuple containing:

        * **training_features** : ``np.ndarray``
        * **training_labels** : ``np.ndarray``
        * **testing_features** : ``np.ndarray``
        * **testing_labels** : ``np.ndarray``

        If ``include_sample_total=True``, a fifth ``np.ndarray`` element holds
        the total number of accepted samples.

    Raises:
        ValueError: If any size/qubit/gap argument is negative or out of range,
            or if ``entanglement``, ``sampling_method`` or ``labelling_method``
            is invalid, or if grid sampling is requested with ``n > 3``, or if
            ``divisions`` is unset for hypercube sampling.
    """
    # Default value for the non-one-hot class labels.
    if class_labels is None:
        class_labels = [0, 1]

    # Argument validation.
    if training_size < 0:
        raise ValueError("Training size can't be less than 0")
    if test_size < 0:
        raise ValueError("Test size can't be less than 0")
    if n <= 0:
        raise ValueError("Number of qubits can't be less than 1")
    if gap < 0 and labelling_method == "expectation":
        raise ValueError("Gap can't be less than 0")
    if entanglement not in {"linear", "circular", "full"}:
        raise ValueError("Invalid entanglement type. Must be 'linear', 'circular', or 'full'.")
    if sampling_method not in {"grid", "hypercube", "sobol"}:
        raise ValueError("Invalid sampling method. Must be 'grid', 'hypercube', or 'sobol'.")
    if divisions == 0 and sampling_method == "hypercube":
        raise ValueError("Divisions must be set for 'hypercube' sampling.")
    if labelling_method not in {"expectation", "measurement"}:
        raise ValueError("Invalid labelling method. Must be 'expectation' or 'measurement'.")
    if n > 3 and sampling_method == "grid":
        raise ValueError("Grid sampling is unsupported for n > 3.")

    # Warnings for unsupported/inadvisable option combinations.
    if n > 3 and plot_data:
        warnings.warn(
            "Plotting for n > 3 is unsupported. Disabling plot_data.",
            UserWarning,
        )
        plot_data = False
    if sampling_method == "grid" and (training_size + test_size) > 4000:
        warnings.warn(
            """Grid Sampling for large number of samples is not recommended
            and can lead to samples repeating in the training and testing sets""",
            UserWarning,
        )

    # Initial state |+>^n = H^n |0>^n, represented as a uniform real vector.
    dims = 2**n
    psi_0 = np.ones(dims) / np.sqrt(dims)

    # n-qubit Hadamard matrix.
    h_n = _n_hadamard(n)

    # Diagonals of the single-qubit Z operators, one row per qubit.
    z_diags = np.array([np.diag(_i_z(i, n)).reshape((1, -1)) for i in range(n)])

    # Precompute the ZZ diagonals selected by the entanglement pattern.
    zz_diags = {}
    if entanglement == "full":
        for i, j in it.combinations(range(n), 2):
            zz_diags[(i, j)] = z_diags[i] * z_diags[j]
    else:
        for i in range(n - 1):
            zz_diags[(i, i + 1)] = z_diags[i] * z_diags[i + 1]

        if entanglement == "circular":
            zz_diags[(n - 1, 0)] = z_diags[n - 1] * z_diags[0]

    # n-qubit Z gate: notice that h_n[0,:] has the same elements as diagonal of z_n
    z_n = _n_z(h_n)

    # V change of basis: eigenbasis of a random hermitian is a random unitary.
    v = _random_unitary(dims)

    # Observable defining the labelling boundary: O = V† (Z⊗…⊗Z) V.
    mat_o = v.conj().T @ z_n @ v

    n_samples = training_size + test_size

    # Select the labelling function (closure over gap / observable).
    if labelling_method == "expectation":

        def _lab_fn(psi_state):
            return _exp_label(psi_state, gap, mat_o)

    else:
        # Measurement labelling needs the eigen-decomposition of the observable.
        eig = np.linalg.eigh(mat_o)

        def _lab_fn(psi_state):
            return _measure(psi_state, eig)

    # Select the sampling strategy and generate per-class feature vectors.
    if sampling_method == "grid":
        a_features, b_features = _grid_sampling(
            n, n_samples, z_diags, zz_diags, psi_0, h_n, _lab_fn
        )
    else:
        if sampling_method == "hypercube":

            def _samp_fn(a, b):
                return _modified_lhc(a, b, divisions)

        else:

            def _samp_fn(a, b):
                return _sobol_sampling(a, b)

        a_features, b_features = _loop_sampling(
            n,
            n_samples,
            z_diags,
            zz_diags,
            psi_0,
            h_n,
            _lab_fn,
            _samp_fn,
            sampling_method,
        )

    if plot_data:
        _plot_ad_hoc_data(a_features, b_features, training_size)

    # Split each class into train/test partitions, class A first.
    x_train = np.concatenate((a_features[:training_size], b_features[:training_size]), axis=0)
    x_test = np.concatenate((a_features[training_size:], b_features[training_size:]), axis=0)

    if one_hot:
        y_train = np.array([[1, 0]] * training_size + [[0, 1]] * training_size)
        y_test = np.array([[1, 0]] * test_size + [[0, 1]] * test_size)
    else:
        y_train = np.array([class_labels[0]] * training_size + [class_labels[1]] * training_size)
        y_test = np.array([class_labels[0]] * test_size + [class_labels[1]] * test_size)

    if include_sample_total:
        # Total accepted samples across both classes.
        samples = np.array([n_samples * 2])
        return (x_train, y_train, x_test, y_test, samples)

    return (x_train, y_train, x_test, y_test)
@optionals.HAS_MATPLOTLIB.require_in_call
def _plot_ad_hoc_data(a_features: np.ndarray, b_features: np.ndarray, training_size: int) -> None:
    """Plot the training portion of the ad hoc dataset (2D or 3D scatter).

    Args:
        a_features (np.ndarray): Class-A feature vectors, shape ``(samples, n)``.
        b_features (np.ndarray): Class-B feature vectors, shape ``(samples, n)``.
        training_size (int): Number of training samples to plot per class.
    """
    import matplotlib.pyplot as plt

    fig = plt.figure()
    # Use a 3D projection only for three-feature data; 1D/2D use a flat axis.
    projection = "3d" if a_features.shape[1] == 3 else None
    ax1 = fig.add_subplot(1, 1, 1, projection=projection)
    ax1.scatter(*a_features[:training_size].T)
    ax1.scatter(*b_features[:training_size].T)
    ax1.set_title("Ad-hoc Data")
    plt.show()


def _n_hadamard(n: int) -> np.ndarray:
    """Generate the n-qubit Hadamard matrix H^{⊗n}.

    Uses exponentiation-by-squaring over the Kronecker product, so only
    O(log n) kron operations are performed.

    Args:
        n (int): Number of qubits.

    Returns:
        np.ndarray: The ``2**n x 2**n`` Hadamard matrix.
    """
    base = np.array([[1, 1], [1, -1]]) / np.sqrt(2)
    result = np.eye(1)
    expo = n

    while expo > 0:
        if expo % 2 == 1:
            result = np.kron(result, base)
        base = np.kron(base, base)
        expo //= 2

    return result


def _i_z(i: int, n: int) -> np.ndarray:
    """Create the Z gate acting on qubit ``i`` of an n-qubit system.

    Builds ``I_{2^i} ⊗ Z ⊗ I_{2^(n-i-1)}``.

    Args:
        i (int): Index of the qubit the Z acts on.
        n (int): Total number of qubits.

    Returns:
        np.ndarray: The ``2**n x 2**n`` diagonal matrix.
    """
    z = np.diag([1, -1])
    i_1 = np.eye(2**i)
    i_2 = np.eye(2 ** (n - i - 1))

    result = np.kron(i_1, z)
    result = np.kron(result, i_2)

    return result


def _n_z(h_n: np.ndarray) -> np.ndarray:
    """Generate the n-qubit Z gate Z^{⊗n} from the n-qubit Hadamard matrix.

    The diagonal of H^{⊗n} has sign (-1)^popcount(i), which matches the
    diagonal of Z^{⊗n}, so taking the sign of the diagonal recovers it.

    Args:
        h_n (np.ndarray): n-qubit Hadamard matrix.

    Returns:
        np.ndarray: The n-qubit Z gate as a diagonal matrix.
    """
    res = np.diag(h_n)
    res = np.sign(res)
    res = np.diag(res)
    return res


def _modified_lhc(n: int, n_samples: int, n_div: int) -> np.ndarray:
    """Generate samples in (0, 2π]^n using modified Latin Hypercube Sampling.

    Each dimension is divided into ``n_div`` equal bins; bins are shuffled and
    cycled so every bin is used before any repeats, then a uniform offset is
    added inside each chosen bin.

    Args:
        n (int): Dimensionality of the data.
        n_samples (int): Number of samples to generate.
        n_div (int): Number of stratification bins per dimension.

    Returns:
        np.ndarray: Samples of shape ``(n_samples, n)``.
    """
    samples = np.empty((n_samples, n), dtype=float)
    bin_size = 2 * np.pi / n_div

    # Enough shuffled passes over the bins to cover n_samples draws.
    n_passes = (n_samples + n_div - 1) // n_div
    all_bins = np.tile(np.arange(n_div), n_passes)

    for dim in range(n):
        algorithm_globals.random.shuffle(all_bins)
        chosen_bins = all_bins[:n_samples]
        offsets = algorithm_globals.random.random(n_samples)
        samples[:, dim] = (chosen_bins + offsets) * bin_size

    return samples


def _sobol_sampling(n: int, n_samples: int) -> np.ndarray:
    """Generate samples using a scrambled Sobol sequence.

    Args:
        n (int): Dimensionality of the data.
        n_samples (int): Number of samples to generate.

    Returns:
        np.ndarray: Samples of shape ``(n_samples, n)`` scaled to [0, 2π).
    """
    sampler = Sobol(d=n, scramble=True)
    p = 2 * np.pi * sampler.random(n_samples)
    return p


def _phi_i(x_vecs: np.ndarray, i: int) -> np.ndarray:
    """Compute the first-order feature-map angle φ_{i}(x) = x_i.

    Args:
        x_vecs (np.ndarray): Input sample vectors, shape ``(samples, n)``.
        i (int): Dimension index.

    Returns:
        np.ndarray: Column vector of φ_i values, shape ``(samples, 1)``.
    """
    return x_vecs[:, i].reshape((-1, 1))


def _phi_ij(x_vecs: np.ndarray, i: int, j: int) -> np.ndarray:
    """Compute the second-order angle φ_{ij}(x) = (π - x_i)(π - x_j).

    Args:
        x_vecs (np.ndarray): Input sample vectors, shape ``(samples, n)``.
        i (int): First dimension index.
        j (int): Second dimension index.

    Returns:
        np.ndarray: Column vector of φ_ij values, shape ``(samples, 1)``.
    """
    return ((np.pi - x_vecs[:, i]) * (np.pi - x_vecs[:, j])).reshape((-1, 1))


def _random_unitary(dims: int) -> np.ndarray:
    """Generate a random ``dims x dims`` unitary matrix.

    Forms a random Hermitian matrix A†A and returns its eigenvector matrix
    (columns sorted by descending eigenvalue), which is unitary.

    Args:
        dims (int): Matrix dimension.

    Returns:
        np.ndarray: A random unitary matrix.
    """
    a = np.array(
        algorithm_globals.random.random((dims, dims))
        + 1j * algorithm_globals.random.random((dims, dims))
    )
    herm = a.conj().T @ a
    eigvals, eigvecs = np.linalg.eig(herm)
    idx = eigvals.argsort()[::-1]
    v = eigvecs[:, idx]
    return v


def _loop_sampling(n, n_samples, z_diags, zz_diags, psi_0, h_n, lab_fn, samp_fn, sampling_method):
    """Rejection-sample feature vectors until both classes are filled.

    Repeatedly draws candidate vectors with ``samp_fn``, pushes them through
    the feature map, labels the resulting states with ``lab_fn``, and keeps
    candidates until ``n_samples`` per class have been accepted. Samples whose
    label is 0 (inside the gap zone) are discarded.

    Args:
        n (int): Number of qubits (feature dimension).
        n_samples (int): Number of samples needed per class.
        z_diags (np.ndarray): Array of single-qubit Z diagonal elements.
        zz_diags (dict): Dictionary of ZZ-diagonal elements keyed by qubit pairs.
        psi_0 (np.ndarray): Initial state vector.
        h_n (np.ndarray): n-qubit Hadamard matrix.
        lab_fn (Callable): Labelling function (expectation- or measurement-based).
        samp_fn (Callable): Sampling function that generates feature vectors.
        sampling_method (str): Which sampling method is used
            ("grid", "hypercube", or "sobol").

    Returns:
        tuple[np.ndarray, np.ndarray]: Two arrays of shape ``(n_samples, n)``
        holding the accepted class-A and class-B feature vectors, respectively.
    """
    a_features = np.empty((n_samples, n), dtype=float)
    b_features = np.empty((n_samples, n), dtype=float)

    dims = 2**n
    a_cur, b_cur = 0, 0
    a_needed, b_needed = n_samples, n_samples

    while a_needed > 0 or b_needed > 0:
        n_pass = a_needed + b_needed

        # Sobol works better with a power of two just above n_pass.
        if sampling_method == "sobol":
            n_pass = 2 ** ((n_pass - 1).bit_length())

        # Stratified sampling of candidate x vectors.
        x_vecs = samp_fn(n, n_pass)
        pre_exp = np.zeros((n_pass, dims))

        # First-order terms: sum_i φ_i(x) Z_i (diagonals only).
        for i in range(n):
            pre_exp += _phi_i(x_vecs, i) * z_diags[i]

        # Second-order terms: sum_{ij} φ_ij(x) Z_i Z_j.
        for i, j in zz_diags.keys():
            pre_exp += _phi_ij(x_vecs, i, j) * zz_diags[(i, j)]

        # pre_exp is purely diagonal, so exp(A) = diag(exp(A_ii)).
        post_exp = np.exp(1j * pre_exp)
        uphi = np.zeros((n_pass, dims, dims), dtype=post_exp.dtype)
        cols = range(dims)
        uphi[:, cols, cols] = post_exp[:, cols]

        # |Φ(x)> = U_Φ H U_Φ |+>^n for every candidate at once.
        psi = (uphi @ h_n @ uphi @ psi_0).reshape((-1, dims, 1))

        # Label candidates and fill each class up to its remaining quota.
        raw_labels = lab_fn(psi)

        if a_needed > 0:
            a_indx = raw_labels == 1
            a_count = min(int(np.sum(a_indx)), a_needed)
            a_features[a_cur : a_cur + a_count] = x_vecs[a_indx][:a_count]
            a_cur += a_count
            a_needed -= a_count

        if b_needed > 0:
            b_indx = raw_labels == -1
            b_count = min(int(np.sum(b_indx)), b_needed)
            b_features[b_cur : b_cur + b_count] = x_vecs[b_indx][:b_count]
            b_cur += b_count
            b_needed -= b_count

    return a_features, b_features


def _exp_label(psi, gap, mat_o):
    """Label states by comparing the observable's expectation value to a gap.

    Args:
        psi (np.ndarray): Array of shape ``(num_samples, dim, 1)`` containing
            the statevectors for each sample.
        gap (float): Separation gap Δ. If the absolute expectation exceeds it,
            the sample is labelled ±1 based on the sign.
        mat_o (np.ndarray): Observable used for label determination.

    Returns:
        np.ndarray: Labels for each sample: -1, 0, or +1, where 0 indicates an
        expectation value inside the gap zone (not exceeding ±gap).
    """
    psi_dag = np.transpose(psi.conj(), (0, 2, 1))
    exp_val = np.real(psi_dag @ mat_o @ psi).flatten()
    # Zero out samples inside the gap; keep the sign elsewhere.
    labels = (np.abs(exp_val) > gap) * (np.sign(exp_val))
    return labels


def _measure(psi, eig):
    """Label states by simulating a measurement of the observable.

    The eigen-decomposition of the observable is the measurement basis. Each
    state is projected onto the eigenvectors, a single outcome is drawn from
    the resulting probabilities via inverse-CDF sampling, and the label is the
    sign of the corresponding eigenvalue.

    Args:
        psi (np.ndarray): Array of shape ``(num_samples, dim, 1)`` containing
            the statevectors for each sample.
        eig (tuple): ``(eigenvalues, eigenvectors)`` of the observable, as
            returned by ``np.linalg.eigh``.

    Returns:
        np.ndarray: ±1 label for each sample (sign of the measured eigenvalue).
    """
    eigenvalues, eigenvectors = eig
    eigshape = eigenvectors.shape

    # Amplitudes of each state in the observable's eigenbasis.
    new_psi = eigenvectors.T.conj().reshape((1, eigshape[1], eigshape[0])) @ psi
    probab = np.abs(new_psi) ** 2

    # Inverse-CDF draw of one measurement outcome per sample.
    toss = algorithm_globals.random.random((psi.shape[0], 1))
    cum_probab = np.cumsum(probab, axis=1).reshape(psi.shape[0], -1)
    collapsed = (cum_probab >= toss).argmax(axis=-1, keepdims=True)
    labels = eigenvalues[collapsed.flatten()]

    return np.sign(labels)


def _grid_sampling(n, n_samples, z_diags, zz_diags, psi_0, h_n, lab_fn):
    """Sample feature vectors from a uniform grid (supported only for n <= 3).

    Labels every grid point once with ``lab_fn``, then draws random grid
    points until ``n_samples`` per class are collected (points labelled 0 are
    rejected). Note: drawing with replacement means points can repeat.

    Args:
        n (int): Number of qubits (dimension).
        n_samples (int): Number of samples needed per class.
        z_diags (np.ndarray): Array of single-qubit Z diagonal elements.
        zz_diags (dict): Dictionary of ZZ-diagonal elements keyed by qubit pairs.
        psi_0 (np.ndarray): Initial state vector.
        h_n (np.ndarray): n-qubit Hadamard matrix.
        lab_fn (Callable): Labelling function (expectation- or measurement-based).

    Returns:
        tuple[np.ndarray, np.ndarray]: Two arrays of shape ``(n_samples, n)``
        holding the sampled class-A and class-B feature vectors, respectively.
    """
    # Grid resolution per dimension, chosen so the total grid stays tractable.
    count = 1
    if n == 1:
        count = 5000
    elif n == 2:
        count = 100
    elif n == 3:
        count = 20

    xvals = np.linspace(0, 2 * np.pi, count, endpoint=False)
    grid_labels = []

    # Label every point of the uniform grid once.
    for x in it.product(*[xvals] * n):
        x_arr = np.array(x)
        pre_exp = 0

        for i in range(n):
            pre_exp += x_arr[i] * z_diags[i]
        for i, j in zz_diags.keys():
            pre_exp += ((np.pi - x_arr[i]) * (np.pi - x_arr[j])) * zz_diags[(i, j)]

        uphi = np.diag(np.exp(1j * pre_exp.flatten()))
        psi = uphi @ h_n @ uphi @ psi_0
        label = lab_fn(psi.reshape((1, -1, 1)))
        grid_labels.append(label)

    grid_labels = np.array(grid_labels).reshape(*[count] * n)
    count = grid_labels.shape[0]

    # Rejection-sample grid points (with replacement) until quotas are met.
    a_features, b_features = [], []
    while len(a_features) < n_samples:
        draws = tuple(algorithm_globals.random.choice(count) for _ in range(n))
        if grid_labels[draws] == 1:
            a_features.append([xvals[d] for d in draws])

    while len(b_features) < n_samples:
        draws = tuple(algorithm_globals.random.choice(count) for _ in range(n))
        if grid_labels[draws] == -1:
            b_features.append([xvals[d] for d in draws])

    return np.array(a_features), np.array(b_features)