home..
pandas.merge
November 2021 (1157 Words, 7 Minutes)
pd.merge
에서 array의 크기가 클수록 리소스를 비례적으로 잡아먹는다.
아래를 보면 pd.merge
내부적으로 numpy.concat을 실행한다.
# pd.merge -> _MergeOperation(..).get_result()
def get_result(self):
if self.indicator:
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
join_index, left_indexer, right_indexer = self._get_join_info()
llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, self.right._info_axis, self.suffixes
)
lindexers = {1: left_indexer} if left_indexer is not None else {}
rindexers = {1: right_indexer} if right_indexer is not None else {}
result_data = concatenate_block_managers( # concat을 먼저 하고
[(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
axes=[llabels.append(rlabels), join_index],
concat_axis=0,
copy=self.copy,
)
typ = self.left._constructor
result = typ(result_data).__finalize__(self, method=self._merge_type)
if self.indicator:
result = self._indicator_post_merge(result)
self._maybe_add_join_keys(result, left_indexer, right_indexer)
self._maybe_restore_index_levels(result)
self._maybe_drop_cross_column(result, self._cross)
return result.__finalize__(self, method="merge")
def concatenate_block_managers(
mgrs_indexers, axes, concat_axis: int, copy: bool
) -> BlockManager:
"""
Concatenate block managers into one.
Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
Returns
-------
BlockManager
"""
concat_plans = [
_get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
]
concat_plan = _combine_concat_plans(concat_plans, concat_axis)
blocks = []
for placement, join_units in concat_plan:
if len(join_units) == 1 and not join_units[0].indexers:
b = join_units[0].block
values = b.values
if copy:
values = values.copy()
else:
values = values.view()
b = b.make_block_same_class(values, placement=placement)
elif _is_uniform_join_units(join_units):
blk = join_units[0].block
vals = [ju.block.values for ju in join_units]
if not blk.is_extension:
values = concat_compat(vals, axis=blk.ndim - 1)
else:
# TODO(EA2D): special-casing not needed with 2D EAs
values = concat_compat(vals)
if not isinstance(values, ExtensionArray):
values = values.reshape(1, len(values))
b = make_block(values, placement=placement, ndim=blk.ndim)
else:
b = make_block(
_concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement,
ndim=len(axes),
)
blocks.append(b)
return BlockManager(blocks, axes)
def concat_compat(to_concat, axis: int = 0):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtypes (in that for example, if it's object, then it is a
non-datetimelike and provide a combined dtype for the resulting array that
preserves the overall dtype if possible)
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""
# filter empty arrays
# 1-d dtypes always are included here
def is_nonempty(x) -> bool:
if x.ndim <= axis:
return True
return x.shape[axis] > 0
# If all arrays are empty, there's nothing to convert, just short-cut to
# the concatenation, #3121.
#
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.
non_empties = [x for x in to_concat if is_nonempty(x)]
if non_empties and axis == 0:
to_concat = non_empties
typs = _get_dtype_kinds(to_concat)
_contains_datetime = any(typ.startswith("datetime") for typ in typs)
all_empty = not len(non_empties)
single_dtype = len({x.dtype for x in to_concat}) == 1
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)
if any_ea:
# we ignore axis here, as internally concatting with EAs is always
# for axis=0
if not single_dtype:
target_dtype = find_common_type([x.dtype for x in to_concat])
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
if isinstance(to_concat[0], ExtensionArray):
cls = type(to_concat[0])
return cls._concat_same_type(to_concat)
else:
return np.concatenate(to_concat)
elif _contains_datetime or "timedelta" in typs:
return _concat_datetime(to_concat, axis=axis)
elif all_empty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
typs = _get_dtype_kinds(to_concat)
if len(typs) != 1:
if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}):
# let numpy coerce
pass
else:
# coerce to object
to_concat = [x.astype("object") for x in to_concat]
return np.concatenate(to_concat, axis=axis)