Pythonで文字列をバイト列に変換する方法は？

encode()メソッドを使うのが一般的で、mystring.encode('utf-8')のように使用します。また、bytes(mystring, 'utf-8')も使えます。

encode()とbytes()の違いは？

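encode()はstr専用のメソッドで、文字列を明示的にエンコードし、対となるdecode()（bytes側のメソッド）が存在します。bytes()はコンストラクタで、文字列のほか、整数のリストや整数（長さ）など複数のデータ型からバイト列を作成できます。なお、bytes()に文字列を渡す場合はエンコーディングの指定が必須で、bytes(mystring)のように省略するとTypeErrorになります。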

encode()のデフォルトのエンコーディングは？

encode()のデフォルトは'utf-8'です。

【Python】文字列をバイト列に変換する方法

Pythonで文字列（str）をバイト列（bytes）に変換する方法について解説します。ファイルの読み書きやネットワーク通信では、この変換が頻繁に必要になります。

変換方法の比較

方法	構文	推奨度	用途
`encode()`	`s.encode('utf-8')`	⭐⭐⭐	一般的な文字列変換
`bytes()`	`bytes(s, 'utf-8')`	⭐⭐	コンストラクタ形式
リテラル	`b'hello'`	⭐⭐⭐	ASCII文字列のみ

encode() メソッド（推奨）

str.encode()は文字列をバイト列に変換する最も一般的な方法です。

# Basic usage: str.encode() returns a new bytes object.
text = "Hello, World!"
bytes_data = text.encode(encoding='utf-8')
print(bytes_data, type(bytes_data), sep='\n')  # b'Hello, World!' / <class 'bytes'>

# Calling encode() with no arguments uses UTF-8 as well.
bytes_data = text.encode()

# Non-ASCII text: each hiragana below occupies three bytes in UTF-8.
japanese = "こんにちは"
bytes_jp = japanese.encode('utf-8')
byte_count = len(bytes_jp)
print(bytes_jp)    # b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf'
print(byte_count)  # 15

エンコーディングの種類

text = "こんにちは"

# Encode the same five characters with every codec first, then compare sizes.
utf8 = text.encode('utf-8')       # recommended default
utf16 = text.encode('utf-16')     # output includes a byte-order mark (BOM)
sjis = text.encode('shift_jis')   # legacy Japanese codec
eucjp = text.encode('euc-jp')     # legacy Japanese codec

print(f"UTF-8: {len(utf8)} bytes")      # 15 bytes
print(f"UTF-16: {len(utf16)} bytes")    # 12 bytes (BOM included)
print(f"Shift_JIS: {len(sjis)} bytes")  # 10 bytes
print(f"EUC-JP: {len(eucjp)} bytes")    # 10 bytes

エラーハンドリング

# Shows what each `errors` handler does with a character the target codec
# cannot represent (the emoji is outside ASCII).
text = "Hello 🌍 World"

# 'strict' is the default handler: encoding raises UnicodeEncodeError.
try:
    result = text.encode('ascii', 'strict')
except UnicodeEncodeError as exc:
    print(f"Error: {exc}")

# The remaining handlers do not raise; they differ in what gets substituted.
for handler in (
    'ignore',             # drop the character      -> b'Hello  World'
    'replace',            # substitute '?'          -> b'Hello ? World'
    'xmlcharrefreplace',  # XML character reference -> b'Hello &#127757; World'
    'backslashreplace',   # backslash escape        -> b'Hello \\U0001f30d World'
):
    result = text.encode('ascii', errors=handler)
    print(result)

bytes() コンストラクタ

bytes()はより汎用的なコンストラクタで、複数の方法でバイト列を作成できます。

# From a str: the text is encoded with the given codec.
text = "Hello"
bytes_data = bytes(text, encoding='utf-8')

# From a list of integers: each integer becomes one byte.
bytes_from_list = bytes([72, 101, 108, 108, 111])

# From a single integer: that many zero bytes.
zero_bytes = bytes(10)

# With no arguments: an empty bytes object.
empty = bytes()

print(bytes_data)       # b'Hello'
print(bytes_from_list)  # b'Hello'
print(zero_bytes)       # b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
print(empty)            # b''

バイト列から文字列への変換（decode）

# Round trip: decode UTF-8 bytes back into a str.
bytes_data = b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf'
text = bytes_data.decode('utf-8')
print(text)  # こんにちは

# 0xFF and 0xFE never occur in well-formed UTF-8, so strict decoding fails.
invalid_bytes = b'\xff\xfe'
try:
    text = invalid_bytes.decode('utf-8')
except UnicodeDecodeError as exc:
    print(f"Decode error: {exc}")

# Lenient handlers return a str instead of raising.
for handler in (
    'ignore',   # drop undecodable bytes      -> '' (empty string)
    'replace',  # substitute U+FFFD for each  -> '\ufffd\ufffd'
):
    text = invalid_bytes.decode('utf-8', errors=handler)
    print(text)

実践的な使用例

ファイルの読み書き

# Binary-mode files accept bytes only, so encode the text before writing it.
text = "日本語のテキスト"
payload = text.encode('utf-8')
with open('output.bin', mode='wb') as sink:
    sink.write(payload)

# Reading in binary mode yields bytes; decode them to get the text back.
with open('output.bin', mode='rb') as source:
    bytes_data = source.read()

text = bytes_data.decode('utf-8')
print(text)  # 日本語のテキスト

HTTPリクエスト/レスポンス

import urllib.request

# Fetch a URL; response.read() returns the raw body as bytes.
url = "https://example.com"
with urllib.request.urlopen(url) as response:
    bytes_data = response.read()
    # Decode with the charset declared in the Content-Type header instead of
    # assuming UTF-8; fall back to UTF-8 when the server does not declare one.
    charset = response.headers.get_content_charset() or 'utf-8'
    html = bytes_data.decode(charset)
    print(html[:100])

Base64エンコーディング

import base64

# Base64 operates on bytes: encode the text first, then Base64-encode it.
text = "Hello, World!"
bytes_data = text.encode('utf-8')
base64_bytes = base64.b64encode(bytes_data)
# The Base64 output is plain ASCII, so an ASCII decode gives a printable str.
base64_str = str(base64_bytes, 'ascii')

# Reverse direction: Base64 text -> raw bytes -> original str.
decoded_bytes = base64.b64decode(base64_str)
original_text = str(decoded_bytes, 'utf-8')

print(base64_str)     # SGVsbG8sIFdvcmxkIQ==
print(original_text)  # Hello, World!

ハッシュ計算

import hashlib

text = "パスワード"

# hashlib digests consume bytes, so the str is encoded before hashing.
# NOTE: this only demonstrates the str -> bytes step; a bare SHA-256 digest is
# not a suitable way to store real passwords (see hashlib.scrypt /
# hashlib.pbkdf2_hmac for that purpose).
bytes_data = text.encode('utf-8')
hash_object = hashlib.sha256()
hash_object.update(bytes_data)
hash_hex = hash_object.hexdigest()
print(hash_hex)

ソケット通信

import socket

def send_message(
    host: str,
    port: int,
    message: str,
    *,
    encoding: str = 'utf-8',
    bufsize: int = 1024,
) -> str:
    """Send *message* to a TCP server and return its reply as text.

    Args:
        host: Host name or IPv4 address to connect to.
        port: TCP port number.
        message: Text to send; it is encoded with *encoding* before sending.
        encoding: Codec used for both the outgoing message and the reply.
        bufsize: Maximum number of bytes read from the socket for the reply.

    Returns:
        The decoded reply. Only a single ``recv`` call is made, so a reply
        longer than *bufsize* bytes is truncated.

    Raises:
        OSError: If connecting, sending or receiving fails.
        UnicodeEncodeError: If *message* cannot be represented in *encoding*.
        UnicodeDecodeError: If the received bytes are not valid *encoding*
            (this can also happen when a multi-byte character is cut off at
            the *bufsize* boundary).
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect((host, port))
        # Sockets carry bytes only, so the str is encoded before sending.
        s.sendall(message.encode(encoding))

        # Read at most one buffer of reply data and turn it back into text.
        response = s.recv(bufsize)
        return response.decode(encoding)

strとbytesの違い

# str: a sequence of Unicode characters; indexing yields a 1-character str.
s = "Hello"
print(type(s), len(s), s[0], sep='\n')  # <class 'str'> / 5 (characters) / H

# bytes: a sequence of byte values; indexing yields an int.
b = b"Hello"
print(type(b), len(b), b[0], sep='\n')  # <class 'bytes'> / 5 (bytes) / 72

# The two lengths diverge once the text is not ASCII.
s = "あ"
b = s.encode('utf-8')
char_count, byte_count = len(s), len(b)
print(char_count)  # 1 (one character)
print(byte_count)  # 3 (three bytes)

bytearrayとの違い

# bytes is immutable: an item assignment such as `b[0] = 74` raises
# "TypeError: 'bytes' object does not support item assignment".
b = b"Hello"

# bytearray is the mutable counterpart, so the same assignment succeeds.
ba = bytearray(b"Hello")
ba[0] = ord('J')  # 74
print(ba)  # bytearray(b'Jello')

# Build one from a str, append more bytes in place, then decode back to str.
text = "Hello"
ba = bytearray(text, encoding='utf-8')
ba += b" World"
result = ba.decode('utf-8')
print(result)  # Hello World

よくあるエラーと対処法

# TypeError: a bytes-like object is required, not 'str'
# Cause: a file opened in binary mode only accepts bytes, so writing the str
# "Hello" directly fails; encode it first.
with open('file.bin', mode='wb') as binary_file:
    encoded_greeting = "Hello".encode('utf-8')
    binary_file.write(encoded_greeting)

# UnicodeDecodeError
# Cause: the bytes are decoded with a codec other than the one that made them.
bytes_data = "こんにちは".encode('shift_jis')
try:
    text = bytes_data.decode('utf-8')  # raises: these bytes are Shift_JIS
except UnicodeDecodeError:
    text = bytes_data.decode('shift_jis')  # decode with the matching codec

# UnicodeEncodeError
# Cause: the target codec has no representation for some character.
text = "絵文字: 🎉"
try:
    bytes_data = text.encode('ascii')  # raises: non-ASCII characters present
except UnicodeEncodeError:
    bytes_data = text.encode('utf-8')  # UTF-8 can encode these characters

まとめ

操作	推奨方法
文字列 → バイト列	`s.encode('utf-8')`
バイト列 → 文字列	`b.decode('utf-8')`
バイトリテラル	`b'ascii only'`
可変バイト列	`bytearray(s, 'utf-8')`

encode()メソッドを使うのが最も直感的で、対となるdecode()メソッドと組み合わせて使用できるため推奨されます。エンコーディングは明示的に指定し、必要に応じてエラーハンドリングを設定しましょう。