home..
pandas.merge

November 2021 (1157 Words, 7 Minutes)
pd.merge에서 array의 크기가 클수록 리소스를 비례적으로 잡아먹는다.
아래를 보면 pd.merge 내부적으로 numpy.concat을 실행한다.
# pd.merge -> _MergeOperation(..).get_result()

    def get_result(self):
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers( # concat을 먼저 하고
            [(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

def concatenate_block_managers(
        mgrs_indexers, axes, concat_axis: int, copy: bool
    ) -> BlockManager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    concat_plans = [
        _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
    ]
    concat_plan = _combine_concat_plans(concat_plans, concat_axis)
    blocks = []

    for placement, join_units in concat_plan:

        if len(join_units) == 1 and not join_units[0].indexers:
            b = join_units[0].block
            values = b.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            b = b.make_block_same_class(values, placement=placement)
        elif _is_uniform_join_units(join_units):
            blk = join_units[0].block
            vals = [ju.block.values for ju in join_units]

            if not blk.is_extension:
                values = concat_compat(vals, axis=blk.ndim - 1)
            else:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(vals)
                if not isinstance(values, ExtensionArray):
                    values = values.reshape(1, len(values))

            b = make_block(values, placement=placement, ndim=blk.ndim)
        else:
            b = make_block(
                _concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement,
                ndim=len(axes),
            )
        blocks.append(b)

    return BlockManager(blocks, axes)


def concat_compat(to_concat, axis: int = 0):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        if x.ndim <= axis:
            return True
        return x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be 
    # marginal given that it would still require shape & dtype calculation and 
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0:
        to_concat = non_empties

    typs = _get_dtype_kinds(to_concat)
    _contains_datetime = any(typ.startswith("datetime") for typ in typs)

    all_empty = not len(non_empties)
    single_dtype = len({x.dtype for x in to_concat}) == 1
    any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([x.dtype for x in to_concat])
            to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]

        if isinstance(to_concat[0], ExtensionArray):
            cls = type(to_concat[0])
            return cls._concat_same_type(to_concat)
        else:
            return np.concatenate(to_concat)

    elif _contains_datetime or "timedelta" in typs:
        return _concat_datetime(to_concat, axis=axis)

    elif all_empty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        typs = _get_dtype_kinds(to_concat)
        if len(typs) != 1:

            if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}):
                # let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype("object") for x in to_concat]

    return np.concatenate(to_concat, axis=axis)