Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/Sources/formale Sprachen/Isabelle/HOL/Probability/ (Beweissystem Isabelle Version 2025-1^©) Datei vom 16.11.2025 mit Größe 101 kB

Quelle Information.thy

Sprache: Isabelle

(*  Title:      HOL/Probability/Information.thy
Author: Johannes Hölzl, TU München
Author: Armin Heller, TU München
*)

section ‹Information theory›

theory Information
imports
  Independent_Family
begin

subsection "Information theory"

locale information_space = prob_space +
  fixes b :: real assumes b_gt_1: "1 < b"

text ‹Introduce some simplification rules for logarithm of base 🍋‹b›.›
lemmas log_simps = log_mult log_inverse log_divide

subsection "Kullback$-$Leibler divergence"

text ‹The Kullback$-$Leibler divergence is also known as relative entropy or
Kullback$-$Leibler distance.›

definition
  "entropy_density b M N = log b ∘ enn2real ∘ RN_deriv M N"

definition
  "KL_divergence b M N = integral🪙L N (entropy_density b M N)"

lemma measurable_entropy_density[measurable]: "entropy_density b M N ∈ borel_measurable M"
  unfolding entropy_density_def by auto

lemma (in sigma_finite_measure) KL_density:
  fixes f :: "'a ==> real"
  assumes "1 < b"
  assumes f[measurable]: "f ∈ borel_measurable M" and nn: "AE x in M. 0 ≤ f x"
  shows "KL_divergence b M (density M f) = (∫x. f x * log b (f x) ∂M)"
  unfolding KL_divergence_def
proof (subst integral_real_density)
  show [measurable]: "entropy_density b M (density M (λx. ennreal (f x))) ∈ borel_measurable M"
    using f
    by (auto simp: comp_def entropy_density_def)
  have "density M (RN_deriv M (density M f)) = density M f"
    using f nn by (intro density_RN_deriv_density) auto
  then have eq: "AE x in M. RN_deriv M (density M f) x = f x"
    using f nn by (intro density_unique) auto
  have "AE x in M. f x * entropy_density b M (density M (λx. ennreal (f x))) x =
               f x * log b (f x)"
    using eq nn by (auto simp: entropy_density_def)
  then show "(∫x. f x * entropy_density b M (density M (λx. ennreal (f x))) x ∂M) = (∫x. f x * log b (f x) ∂M)"
    by (intro integral_cong_AE) measurable
qed fact+

lemma (in sigma_finite_measure) KL_density_density:
  fixes f g :: "'a ==> real"
  assumes "1 < b"
  assumes f: "f ∈ borel_measurable M" "AE x in M. 0 ≤ f x"
  assumes g: "g ∈ borel_measurable M" "AE x in M. 0 ≤ g x"
  assumes ac: "AE x in M. f x = 0 ⟶ g x = 0"
  shows "KL_divergence b (density M f) (density M g) = (∫x. g x * log b (g x / f x) ∂M)"
proof -
  interpret Mf: sigma_finite_measure "density M f"
    using f by (subst sigma_finite_iff_density_finite) auto
  have "KL_divergence b (density M f) (density M g) =
    KL_divergence b (density M f) (density (density M f) (λx. g x / f x))"
    using f g ac by (subst density_density_divide) simp_all
  also have "… = (∫x. (g x / f x) * log b (g x / f x) ∂density M f)"
    using f g ‹1 🚫› by (intro Mf.KL_density) (auto simp: AE_density)
  also have "… = (∫x. g x * log b (g x / f x) ∂M)"
    using ac f g ‹1 🚫› by (subst integral_density) (auto intro!: integral_cong_AE)
  finally show ?thesis .
qed

lemma (in information_space) KL_gt_0:
  fixes D :: "'a ==> real"
  assumes "prob_space (density M D)"
  assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x"
  assumes int: "integrable M (λx. D x * log b (D x))"
  assumes A: "density M D ≠ M"
  shows "0 < KL_divergence b M (density M D)"
proof -
  interpret N: prob_space "density M D" by fact

  obtain A where "A ∈ sets M" "emeasure (density M D) A ≠ emeasure M A"
    using measure_eqI[of "density M D" M] ‹density M D ≠ M› by auto

  let ?D_set = "{x∈space M. D x ≠ 0}"
  have [simp, intro]: "?D_set ∈ sets M"
    using D by auto

  have D_neg: "(∫🪙+ x. ennreal (- D x) ∂M) = 0"
    using D by (subst nn_integral_0_iff_AE) (auto simp: ennreal_neg)

  have "(∫🪙+ x. ennreal (D x) ∂M) = emeasure (density M D) (space M)"
    using D by (simp add: emeasure_density cong: nn_integral_cong)
  then have D_pos: "(∫🪙+ x. ennreal (D x) ∂M) = 1"
    using N.emeasure_space_1 by simp

  have "integrable M D"
    using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all
  then have "integral🪙L M D = 1"
    using D D_pos D_neg by (simp add: real_lebesgue_integral_def)

  have "0 ≤ 1 - measure M ?D_set"
    using prob_le_1 by (auto simp: field_simps)
  also have "… = (∫ x. D x - indicator ?D_set x ∂M)"
    using ‹integrable M D› ‹integral🪙L M D = 1›
    by (simp add: emeasure_eq_measure)
  also have "… < (∫ x. D x * (ln b * log b (D x)) ∂M)"
  proof (rule integral_less_AE)
    show "integrable M (λx. D x - indicator ?D_set x)"
      using ‹integrable M D› by (auto simp: less_top[symmetric])
  next
    from integrable_mult_left(1)[OF int, of "ln b"]
    show "integrable M (λx. D x * (ln b * log b (D x)))"
      by (simp add: ac_simps)
  next
    show "emeasure M {x∈space M. D x ≠ 1 ∧ D x ≠ 0} ≠ 0"
    proof
      assume eq_0: "emeasure M {x∈space M. D x ≠ 1 ∧ D x ≠ 0} = 0"
      then have disj: "AE x in M. D x = 1 ∨ D x = 0"
        using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect)

      have "emeasure M {x∈space M. D x = 1} = (∫🪙+ x. indicator {x∈space M. D x = 1} x ∂M)"
        using D(1) by auto
      also have "… = (∫🪙+ x. ennreal (D x) ∂M)"
        using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ennreal_def)
      finally have "AE x in M. D x = 1"
        using D D_pos by (intro AE_I_eq_1) auto
      then have "(∫🪙+x. indicator A x∂M) = (∫🪙+x. ennreal (D x) * indicator A x∂M)"
        by (intro nn_integral_cong_AE) (auto simp: one_ennreal_def[symmetric])
      also have "… = density M D A"
        using ‹A ∈ sets M› D by (simp add: emeasure_density)
      finally show False using ‹A ∈ sets M› ‹emeasure (density M D) A ≠ emeasure M A› by simp
    qed
    show "{x∈space M. D x ≠ 1 ∧ D x ≠ 0} ∈ sets M"
      using D(1) by (auto intro: sets.sets_Collect_conj)

    have False
      if Dt: "t ∈ space M" "D t ≠ 1" "D t ≠ 0" "0 ≤ D t"
        and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))" for t
    proof -
      have "D t - 1 = D t - indicator ?D_set t"
        using Dt by simp
      also note eq
      also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)"
        using b_gt_1 ‹D t ≠ 0› ‹0 ≤ D t›
        by (simp add: log_def ln_div less_le)
      finally have "ln (1 / D t) = 1 / D t - 1"
        using ‹D t ≠ 0› by (auto simp: field_simps)
      from ln_eq_minus_one[OF _ this] ‹D t ≠ 0› ‹0 ≤ D t› ‹D t ≠ 1›
      show False by auto
    qed
    with D(2)
    show "AE t in M. t ∈ {x∈space M. D x ≠ 1 ∧ D x ≠ 0} ⟶
      D t - indicator ?D_set t ≠ D t * (ln b * log b (D t))"
      by fastforce

    show "AE t in M. D t - indicator ?D_set t ≤ D t * (ln b * log b (D t))"
      using D(2) AE_space
    proof eventually_elim
      fix t assume "t ∈ space M" "0 ≤ D t"
      show "D t - indicator ?D_set t ≤ D t * (ln b * log b (D t))"
      proof cases
        assume asm: "D t ≠ 0"
        then have "0 < D t" using ‹0 ≤ D t› by auto
        then have "0 < 1 / D t" by auto
        have "D t - indicator ?D_set t ≤ - D t * (1 / D t - 1)"
          using asm ‹t ∈ space M› by (simp add: field_simps)
        also have "- D t * (1 / D t - 1) ≤ - D t * ln (1 / D t)"
          using ln_le_minus_one ‹0 🚫 / D t› by (intro mult_left_mono_neg) auto
        also have "… = D t * (ln b * log b (D t))"
          using ‹0 🚫 t› b_gt_1
          by (simp_all add: log_def ln_div)
        finally show ?thesis by simp
      qed simp
    qed
  qed
  also have "… = (∫ x. ln b * (D x * log b (D x)) ∂M)"
    by (simp add: ac_simps)
  also have "… = ln b * (∫ x. D x * log b (D x) ∂M)"
    using int by simp
  finally show ?thesis
    using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff)
qed

lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0"
proof -
  have "AE x in M. 1 = RN_deriv M M x"
  proof (rule RN_deriv_unique)
    show "density M (λx. 1) = M"
      by (simp add: density_1)
  qed auto
  then have "AE x in M. log b (enn2real (RN_deriv M M x)) = 0"
    by (elim AE_mp) simp
  from integral_cong_AE[OF _ _ this]
  have "integral🪙L M (entropy_density b M M) = 0"
    by (simp add: entropy_density_def comp_def)
  then show "KL_divergence b M M = 0"
    unfolding KL_divergence_def
    by auto
qed

lemma (in information_space) KL_eq_0_iff_eq:
  fixes D :: "'a ==> real"
  assumes "prob_space (density M D)"
  assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x"
  assumes int: "integrable M (λx. D x * log b (D x))"
  shows "KL_divergence b M (density M D) = 0 ⟷ density M D = M"
  using KL_same_eq_0[of b] KL_gt_0[OF assms]
  by (auto simp: less_le)

lemma (in information_space) KL_eq_0_iff_eq_ac:
  fixes D :: "'a ==> real"
  assumes "prob_space N"
  assumes ac: "absolutely_continuous M N" "sets N = sets M"
  assumes int: "integrable N (entropy_density b M N)"
  shows "KL_divergence b M N = 0 ⟷ N = M"
proof -
  interpret N: prob_space N by fact
  have "finite_measure N" by unfold_locales
  from real_RN_deriv[OF this ac] obtain D
    where D:
      "random_variable borel D"
      "AE x in M. RN_deriv M N x = ennreal (D x)"
      "AE x in N. 0 < D x"
      "∧x. 0 ≤ D x"
    by this auto

  have "N = density M (RN_deriv M N)"
    using ac by (rule density_RN_deriv[symmetric])
  also have "… = density M D"
    using D by (auto intro!: density_cong)
  finally have N: "N = density M D" .

  from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density
  have "integrable N (λx. log b (D x))"
    by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int])
       (auto simp: N entropy_density_def)
  with D b_gt_1 have "integrable M (λx. D x * log b (D x))"
    by (subst integrable_real_density[symmetric]) (auto simp: N[symmetric] comp_def)
  with ‹prob_space N› D show ?thesis
    unfolding N
    by (intro KL_eq_0_iff_eq) auto
qed

lemma (in information_space) KL_nonneg:
  assumes "prob_space (density M D)"
  assumes D: "D ∈ borel_measurable M" "AE x in M. 0 ≤ D x"
  assumes int: "integrable M (λx. D x * log b (D x))"
  shows "0 ≤ KL_divergence b M (density M D)"
  using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0)

lemma (in sigma_finite_measure) KL_density_density_nonneg:
  fixes f g :: "'a ==> real"
  assumes "1 < b"
  assumes f: "f ∈ borel_measurable M" "AE x in M. 0 ≤ f x" "prob_space (density M f)"
  assumes g: "g ∈ borel_measurable M" "AE x in M. 0 ≤ g x" "prob_space (density M g)"
  assumes ac: "AE x in M. f x = 0 ⟶ g x = 0"
  assumes int: "integrable M (λx. g x * log b (g x / f x))"
  shows "0 ≤ KL_divergence b (density M f) (density M g)"
proof -
  interpret Mf: prob_space "density M f" by fact
  interpret Mf: information_space "density M f" b by standard fact
  have eq: "density (density M f) (λx. g x / f x) = density M g" (is "?DD = _")
    using f g ac by (subst density_density_divide) simp_all

  have "0 ≤ KL_divergence b (density M f) (density (density M f) (λx. g x / f x))"
  proof (rule Mf.KL_nonneg)
    show "prob_space ?DD" unfolding eq by fact
    from f g show "(λx. g x / f x) ∈ borel_measurable (density M f)"
      by auto
    show "AE x in density M f. 0 ≤ g x / f x"
      using f g by (auto simp: AE_density)
    show "integrable (density M f) (λx. g x / f x * log b (g x / f x))"
      using ‹1 🚫› f g ac
      by (subst integrable_density)
         (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If)
  qed
  also have "… = KL_divergence b (density M f) (density M g)"
    using f g ac by (subst density_density_divide) simp_all
  finally show ?thesis .
qed

subsection ‹Finite Entropy›

definition (in information_space) finite_entropy :: "'b measure ==> ('a ==> 'b) ==> ('b ==> real) ==> bool"
where
  "finite_entropy S X f ⟷
    distributed M S X f ∧
    integrable S (λx. f x * log b (f x)) ∧
    (∀x∈space S. 0 ≤ f x)"

lemma (in information_space) finite_entropy_simple_function:
  assumes X: "simple_function M X"
  shows "finite_entropy (count_space (X`space M)) X (λa. measure M {x ∈ space M. X x = a})"
  unfolding finite_entropy_def
proof safe
  have [simp]: "finite (X ` space M)"
    using X by (auto simp: simple_function_def)
  then show "integrable (count_space (X ` space M))
     (λx. prob {xa ∈ space M. X xa = x} * log b (prob {xa ∈ space M. X xa = x}))"
    by (rule integrable_count_space)
  have d: "distributed M (count_space (X ` space M)) X (λx. ennreal (if x ∈ X`space M then prob {xa ∈ space M. X xa = x} else 0))"
    by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob])
  show "distributed M (count_space (X ` space M)) X (λx. ennreal (prob {xa ∈ space M. X xa = x}))"
    by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto
qed (rule measure_nonneg)

lemma ac_fst:
  assumes "sigma_finite_measure T"
  shows "absolutely_continuous S (distr (S ⨂🪙M T) S fst)"
proof -
  interpret sigma_finite_measure T by fact
  { fix A assume A: "A ∈ sets S" "emeasure S A = 0"
    then have "fst -` A ∩ space (S ⨂🪙M T) = A × space T"
      by (auto simp: space_pair_measure dest!: sets.sets_into_space)
    with A have "emeasure (S ⨂🪙M T) (fst -` A ∩ space (S ⨂🪙M T)) = 0"
      by (simp add: emeasure_pair_measure_Times) }
  then show ?thesis
    unfolding absolutely_continuous_def
    by (metis emeasure_distr measurable_fst null_setsD1 null_setsD2 null_setsI sets_distr subsetI)
qed

lemma ac_snd:
  assumes "sigma_finite_measure T"
  shows "absolutely_continuous T (distr (S ⨂🪙M T) T snd)"
proof -
  interpret sigma_finite_measure T by fact
  { fix A assume A: "A ∈ sets T" "emeasure T A = 0"
    then have "snd -` A ∩ space (S ⨂🪙M T) = space S × A"
      by (auto simp: space_pair_measure dest!: sets.sets_into_space)
    with A have "emeasure (S ⨂🪙M T) (snd -` A ∩ space (S ⨂🪙M T)) = 0"
      by (simp add: emeasure_pair_measure_Times) }
  then show ?thesis
    unfolding absolutely_continuous_def
    by (metis emeasure_distr measurable_snd null_setsD1 null_setsD2 null_setsI sets_distr subsetI)
qed

lemma (in information_space) finite_entropy_integrable:
  "finite_entropy S X Px ==> integrable S (λx. Px x * log b (Px x))"
  unfolding finite_entropy_def by auto

lemma (in information_space) finite_entropy_distributed:
  "finite_entropy S X Px ==> distributed M S X Px"
  unfolding finite_entropy_def by auto

lemma (in information_space) finite_entropy_nn:
  "finite_entropy S X Px ==> x ∈ space S ==> 0 ≤ Px x"
  by (auto simp: finite_entropy_def)

lemma (in information_space) finite_entropy_measurable:
  "finite_entropy S X Px ==> Px ∈ S →🪙M borel"
  using distributed_real_measurable[of S Px M X]
    finite_entropy_nn[of S X Px] finite_entropy_distributed[of S X Px] by auto

lemma (in information_space) subdensity_finite_entropy:
  fixes g :: "'b ==> real" and f :: "'c ==> real"
  assumes T: "T ∈ measurable P Q"
  assumes f: "finite_entropy P X f"
  assumes g: "finite_entropy Q Y g"
  assumes Y: "Y = T ∘ X"
  shows "AE x in P. g (T x) = 0 ⟶ f x = 0"
  using subdensity[OF T, of M X "λx. ennreal (f x)" Y "λx. ennreal (g x)"]
    finite_entropy_distributed[OF f] finite_entropy_distributed[OF g]
    finite_entropy_nn[OF f] finite_entropy_nn[OF g]
    assms
  by auto

lemma (in information_space) finite_entropy_integrable_transform:
  "finite_entropy S X Px ==> distributed M T Y Py ==> (∧x. x ∈ space T ==> 0 ≤ Py x) ==>
    X = (λx. f (Y x)) ==> f ∈ measurable T S ==> integrable T (λx. Py x * log b (Px (f x)))"
  using distributed_transform_integrable[of M T Y Py S X Px f "λx. log b (Px x)"]
  using distributed_real_measurable[of S Px M X]
  by (auto simp: finite_entropy_def)

subsection ‹Mutual Information›

definition (in prob_space)
  "mutual_information b S T X Y =
    KL_divergence b (distr M S X ⨂🪙M distr M T Y) (distr M (S ⨂🪙M T) (λx. (X x, Y x)))"

lemma (in information_space) mutual_information_indep_vars:
  fixes S T X Y
  defines "P ≡ distr M S X ⨂🪙M distr M T Y"
  defines "Q ≡ distr M (S ⨂🪙M T) (λx. (X x, Y x))"
  shows "indep_var S X T Y ⟷
    (random_variable S X ∧ random_variable T Y ∧
      absolutely_continuous P Q ∧ integrable Q (entropy_density b P Q) ∧
      mutual_information b S T X Y = 0)"
  unfolding indep_var_distribution_eq
proof safe
  assume rv[measurable]: "random_variable S X" "random_variable T Y"

  interpret X: prob_space "distr M S X"
    by (rule prob_space_distr) fact
  interpret Y: prob_space "distr M T Y"
    by (rule prob_space_distr) fact
  interpret XY: pair_prob_space "distr M S X" "distr M T Y" by standard
  interpret P: information_space P b unfolding P_def by standard (rule b_gt_1)

  interpret Q: prob_space Q unfolding Q_def
    by (rule prob_space_distr) simp

  { assume "distr M S X ⨂🪙M distr M T Y = distr M (S ⨂🪙M T) (λx. (X x, Y x))"
    then have [simp]: "Q = P"  unfolding Q_def P_def by simp

    show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def)
    then have ed: "entropy_density b P Q ∈ borel_measurable P"
      by simp

    have "AE x in P. 1 = RN_deriv P Q x"
    proof (rule P.RN_deriv_unique)
      show "density P (λx. 1) = Q"
        unfolding ‹Q = P› by (intro measure_eqI) (auto simp: emeasure_density)
    qed auto
    then have ae_0: "AE x in P. entropy_density b P Q x = 0"
      by (auto simp: entropy_density_def)
    then have "integrable P (entropy_density b P Q) ⟷ integrable Q (λx. 0::real)"
      using ed unfolding ‹Q = P› by (intro integrable_cong_AE) auto
    then show "integrable Q (entropy_density b P Q)" by simp

    from ae_0 have "mutual_information b S T X Y = (∫x. 0 ∂P)"
      unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] ‹Q = P›
      by (intro integral_cong_AE) auto
    then show "mutual_information b S T X Y = 0"
      by simp }

  { assume ac: "absolutely_continuous P Q"
    assume int: "integrable Q (entropy_density b P Q)"
    assume I_eq_0: "mutual_information b S T X Y = 0"

    have eq: "Q = P"
    proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1])
      show "prob_space Q" by unfold_locales
      show "absolutely_continuous P Q" by fact
      show "integrable Q (entropy_density b P Q)" by fact
      show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure)
      show "KL_divergence b P Q = 0"
        using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def)
    qed
    then show "distr M S X ⨂🪙M distr M T Y = distr M (S ⨂🪙M T) (λx. (X x, Y x))"
      unfolding P_def Q_def .. }
qed

abbreviation (in information_space)
  mutual_information_Pow (‹I'(_ ; _')›) where
  "I(X ; Y) ≡ mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y"

lemma (in information_space)
  fixes Pxy :: "'b × 'c ==> real" and Px :: "'b ==> real" and Py :: "'c ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Fx: "finite_entropy S X Px" and Fy: "finite_entropy T Y Py"
  assumes Fxy: "finite_entropy (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
  defines "f ≡ λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
  shows mutual_information_distr': "mutual_information b S T X Y = integral🪙L (S ⨂??M T) f" (is "?M = ?R")
    and mutual_information_nonneg': "0 ≤ mutual_information b S T X Y"
proof -
  have Px: "distributed M S X Px" and Px_nn: "∧x. x ∈ space S ==> 0 ≤ Px x"
    using Fx by (auto simp: finite_entropy_def)
  have Py: "distributed M T Y Py" and Py_nn: "∧x. x ∈ space T ==> 0 ≤ Py x"
    using Fy by (auto simp: finite_entropy_def)
  have Pxy: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn: "∧x. x ∈ space (S ⨂🪙M T) ==> 0 ≤ Pxy x"
      "∧x y. x ∈ space S ==> y ∈ space T ==> 0 ≤ Pxy (x, y)"
    using Fxy by (auto simp: finite_entropy_def space_pair_measure)

  have [measurable]: "Px ∈ S →🪙M borel"
    using Px Px_nn by (intro distributed_real_measurable)
  have [measurable]: "Py ∈ T →🪙M borel"
    using Py Py_nn by (intro distributed_real_measurable)
  have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂🪙M T) →🪙M borel"
    using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  have X[measurable]: "random_variable S X"
    using Px by auto
  have Y[measurable]: "random_variable T Y"
    using Py by auto
  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..
  interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
  interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
  interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
  let ?P = "S ⨂🪙M T"
  let ?D = "distr M ?P (λx. (X x, Y x))"

  { fix A assume "A ∈ sets S"
    with X[THEN measurable_space] Y[THEN measurable_space]
    have "emeasure (distr M S X) A = emeasure ?D (A × space T)"
      by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
  note marginal_eq1 = this
  { fix A assume "A ∈ sets T"
    with X[THEN measurable_space] Y[THEN measurable_space]
    have "emeasure (distr M T Y) A = emeasure ?D (space S × A)"
      by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
  note marginal_eq2 = this

  have distr_eq: "distr M S X ⨂🪙M distr M T Y = density ?P (λx. ennreal (Px (fst x) * Py (snd x)))"
    unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density]
  proof (subst pair_measure_density)
    show "(λx. ennreal (Px x)) ∈ borel_measurable S" "(λy. ennreal (Py y)) ∈ borel_measurable T"
      using Px Py by (auto simp: distributed_def)
    show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
    show "density (S ⨂🪙M T) (λ(x, y). ennreal (Px x) * ennreal (Py y)) =
      density (S ⨂🪙M T) (λx. ennreal (Px (fst x) * Py (snd x)))"
      using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure)
  qed fact

  have M: "?M = KL_divergence b (density ?P (λx. ennreal (Px (fst x) * Py (snd x)))) (density ?P (λx. ennreal (Pxy x)))"
    unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..

  from Px Py have f: "(λx. Px (fst x) * Py (snd x)) ∈ borel_measurable ?P"
    by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
  have PxPy_nonneg: "AE x in ?P. 0 ≤ Px (fst x) * Py (snd x)"
    using Px_nn Py_nn by (auto simp: space_pair_measure)

  have A: "(AE x in ?P. Px (fst x) = 0 ⟶ Pxy x = 0)"
    by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure)
  moreover
  have B: "(AE x in ?P. Py (snd x) = 0 ⟶ Pxy x = 0)"
    by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure)
  ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 ⟶ Pxy x = 0"
    by auto

  show "?M = ?R"
    unfolding M f_def using Pxy_nn Px_nn Py_nn
    by (intro ST.KL_density_density b_gt_1 f PxPy_nonneg ac) (auto simp: space_pair_measure)

  have X: "X = fst ∘ (λx. (X x, Y x))" and Y: "Y = snd ∘ (λx. (X x, Y x))"
    by auto

  have "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))"
    using finite_entropy_integrable[OF Fxy]
    using finite_entropy_integrable_transform[OF Fx Pxy, of fst]
    using finite_entropy_integrable_transform[OF Fy Pxy, of snd]
    by (simp add: Pxy_nn)
  moreover have "f ∈ borel_measurable (S ⨂🪙M T)"
    unfolding f_def using Px Py Pxy
    by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd''
      intro!: borel_measurable_times borel_measurable_log borel_measurable_divide)
  ultimately have int: "integrable (S ⨂🪙M T) f"
  proof (rule integrable_cong_AE_imp)
    show "AE x in S ⨂🪙M
            T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x))
          = f x"
      using A B
      by (auto simp: f_def field_simps space_pair_measure log_mult log_divide)
  qed

  show "0 ≤ ?M" unfolding M
  proof (intro ST.KL_density_density_nonneg)
    show "prob_space (density (S ⨂🪙M T) (λx. ennreal (Pxy x))) "
      unfolding distributed_distr_eq_density[OF Pxy, symmetric]
      using distributed_measurable[OF Pxy] by (rule prob_space_distr)
    show "prob_space (density (S ⨂🪙M T) (λx. ennreal (Px (fst x) * Py (snd x))))"
      unfolding distr_eq[symmetric] by unfold_locales
    show "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))"
      using int unfolding f_def .
  qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure)
qed

lemma (in information_space)
  fixes Pxy :: "'b × 'c ==> real" and Px :: "'b ==> real" and Py :: "'c ==> real"
  assumes "sigma_finite_measure S" "sigma_finite_measure T"
  assumes Px: "distributed M S X Px" and Px_nn: "∧x. x ∈ space S ==> 0 ≤ Px x"
    and Py: "distributed M T Y Py" and Py_nn: "∧y. y ∈ space T ==> 0 ≤ Py y"
    and Pxy: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn: "∧x y. x ∈ space S ==> y ∈ space T ==> 0 ≤ Pxy (x, y)"
  defines "f ≡ λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
  shows mutual_information_distr: "mutual_information b S T X Y = integral🪙L (S ⨂🪙M T) f" (is "?M = ?R")
    and mutual_information_nonneg: "integrable (S ⨂🪙M T) f ==> 0 ≤ mutual_information b S T X Y"
proof -
  have X[measurable]: "random_variable S X"
    using Px by (auto simp: distributed_def)
  have Y[measurable]: "random_variable T Y"
    using Py by (auto simp: distributed_def)
  have [measurable]: "Px ∈ S →🪙M borel"
    using Px Px_nn by (intro distributed_real_measurable)
  have [measurable]: "Py ∈ T →🪙M borel"
    using Py Py_nn by (intro distributed_real_measurable)
  have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂🪙M T) →🪙M borel"
    using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..
  interpret X: prob_space "distr M S X" using X by (rule prob_space_distr)
  interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr)
  interpret XY: pair_prob_space "distr M S X" "distr M T Y" ..
  let ?P = "S ⨂🪙M T"
  let ?D = "distr M ?P (λx. (X x, Y x))"

  { fix A assume "A ∈ sets S"
    with X[THEN measurable_space] Y[THEN measurable_space]
    have "emeasure (distr M S X) A = emeasure ?D (A × space T)"
      by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
  note marginal_eq1 = this
  { fix A assume "A ∈ sets T"
    with X[THEN measurable_space] Y[THEN measurable_space]
    have "emeasure (distr M T Y) A = emeasure ?D (space S × A)"
      by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) }
  note marginal_eq2 = this

  have distr_eq: "distr M S X ⨂🪙M distr M T Y = density ?P (λx. ennreal (Px (fst x) * Py (snd x)))"
    unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density]
  proof (subst pair_measure_density)
    show "(λx. ennreal (Px x)) ∈ borel_measurable S" "(λy. ennreal (Py y)) ∈ borel_measurable T"
      using Px Py by (auto simp: distributed_def)
    show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] ..
    show "density (S ⨂🪙M T) (λ(x, y). ennreal (Px x) * ennreal (Py y)) =
      density (S ⨂🪙M T) (λx. ennreal (Px (fst x) * Py (snd x)))"
      using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure)
  qed fact

  have M: "?M = KL_divergence b (density ?P (λx. ennreal (Px (fst x) * Py (snd x)))) (density ?P (λx. ennreal (Pxy x)))"
    unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] ..

  from Px Py have f: "(λx. Px (fst x) * Py (snd x)) ∈ borel_measurable ?P"
    by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'')
  have PxPy_nonneg: "AE x in ?P. 0 ≤ Px (fst x) * Py (snd x)"
    using Px_nn Py_nn by (auto simp: space_pair_measure)

  have "(AE x in ?P. Px (fst x) = 0 ⟶ Pxy x = 0)"
    by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure)
  moreover
  have "(AE x in ?P. Py (snd x) = 0 ⟶ Pxy x = 0)"
    by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure)
  ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 ⟶ Pxy x = 0"
    by auto

  show "?M = ?R"
    unfolding M f_def
    using b_gt_1 f PxPy_nonneg ac Pxy_nn
    by (intro ST.KL_density_density) (auto simp: space_pair_measure)

  assume int: "integrable (S ⨂🪙M T) f"
  show "0 ≤ ?M" unfolding M
  proof (intro ST.KL_density_density_nonneg)
    show "prob_space (density (S ⨂🪙M T) (λx. ennreal (Pxy x))) "
      unfolding distributed_distr_eq_density[OF Pxy, symmetric]
      using distributed_measurable[OF Pxy] by (rule prob_space_distr)
    show "prob_space (density (S ⨂🪙M T) (λx. ennreal (Px (fst x) * Py (snd x))))"
      unfolding distr_eq[symmetric] by unfold_locales
    show "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))"
      using int unfolding f_def .
  qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure)
qed

lemma (in information_space)
  fixes Pxy :: "'b × 'c ==> real" and Px :: "'b ==> real" and Py :: "'c ==> real"
  assumes "sigma_finite_measure S" "sigma_finite_measure T"
  assumes Px[measurable]: "distributed M S X Px" and Px_nn: "∧x. x ∈ space S ==> 0 ≤ Px x"
    and Py[measurable]: "distributed M T Y Py" and Py_nn:  "∧x. x ∈ space T ==> 0 ≤ Py x"
    and Pxy[measurable]: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn: "∧x. x ∈ space (S ⨂🪙M T) ==> 0 ≤ Pxy x"
  assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y"
  shows mutual_information_eq_0: "mutual_information b S T X Y = 0"
proof -
  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..
  note
    distributed_real_measurable[OF Px_nn Px, measurable]
    distributed_real_measurable[OF Py_nn Py, measurable]
    distributed_real_measurable[OF Pxy_nn Pxy, measurable]

  have "AE x in S ⨂🪙M T. Px (fst x) = 0 ⟶ Pxy x = 0"
    by (rule subdensity_real[OF measurable_fst Pxy Px]) (auto simp: Px_nn Pxy_nn space_pair_measure)
  moreover
  have "AE x in S ⨂🪙M T. Py (snd x) = 0 ⟶ Pxy x = 0"
    by (rule subdensity_real[OF measurable_snd Pxy Py]) (auto simp: Py_nn Pxy_nn space_pair_measure)
  moreover
  have "AE x in S ⨂🪙M T. Pxy x = Px (fst x) * Py (snd x)"
    by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'')
  ultimately have "AE x in S ⨂🪙M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0"
    by eventually_elim simp
  then have "(∫x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) ∂(S ⨂🪙M T)) = (∫x. 0 ∂(S ⨂🪙M T))"
    by (intro integral_cong_AE) auto
  then show ?thesis
    by (subst mutual_information_distr[OF assms(1-8)]) (auto simp add: space_pair_measure)
qed

lemma (in information_space) mutual_information_simple_distributed:
  assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py"
  assumes XY: "simple_distributed M (λx. (X x, Y x)) Pxy"
  shows "I(X ; Y) = (∑(x, y)∈(λx. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
  note fin = simple_distributed_joint_finite[OF XY, simp]
  show "sigma_finite_measure (count_space (X ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  show "sigma_finite_measure (count_space (Y ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  let ?Pxy = "λx. (if x ∈ (λx. (X x, Y x)) ` space M then Pxy x else 0)"
  let ?f = "λx. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))"
  have "∧x. ?f x = (if x ∈ (λx. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)"
    by auto
  with fin show "(∫ x. ?f x ∂(count_space (X ` space M) ⨂🪙M count_space (Y ` space M))) =
    (∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))"
    by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite sum.If_cases split_beta'
             intro!: sum.cong)
qed (insert X Y XY, auto simp: simple_distributed_def)

lemma (in information_space)
  fixes Pxy :: "'b × 'c ==> real" and Px :: "'b ==> real" and Py :: "'c ==> real"
  assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py"
  assumes Pxy: "simple_distributed M (λx. (X x, Y x)) Pxy"
  assumes ae: "∀x∈space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)"
  shows mutual_information_eq_0_simple: "I(X ; Y) = 0"
proof (subst mutual_information_simple_distributed[OF Px Py Pxy])
  have "(∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) =
    (∑(x, y)∈(λx. (X x, Y x)) ` space M. 0)"
    by (intro sum.cong) (auto simp: ae)
  then show "(∑(x, y)∈(λx. (X x, Y x)) ` space M.
    Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp
qed

subsection ‹Entropy›

definition (in prob_space) entropy :: "real ==> 'b measure ==> ('a ==> 'b) ==> real" where
  "entropy b S X = - KL_divergence b S (distr M S X)"

abbreviation (in information_space)
  entropy_Pow (‹H'(_')›) where
  "H(X) ≡ entropy b (count_space (X`space M)) X"

lemma (in prob_space) distributed_RN_deriv:
  assumes X: "distributed M S X Px"
  shows "AE x in S. RN_deriv S (density S Px) x = Px x"
proof -
  have "distributed M S X (RN_deriv S (density S Px))"
    by (metis RN_derivI assms borel_measurable_RN_deriv distributed_def)
  then show ?thesis
    using assms distributed_unique by blast
qed

lemma (in information_space)
  fixes X :: "'a ==> 'b"
  assumes X[measurable]: "distributed M MX X f" and nn: "∧x. x ∈ space MX ==> 0 ≤ f x"
  shows entropy_distr: "entropy b MX X = - (∫x. f x * log b (f x) ∂MX)" (is ?eq)
proof -
  note D = distributed_measurable[OF X] distributed_borel_measurable[OF X]
  note ae = distributed_RN_deriv[OF X]
  note distributed_real_measurable[OF nn X, measurable]

  have ae_eq: "AE x in distr M MX X. log b (enn2real (RN_deriv MX (distr M MX X) x)) = log b (f x)"
    unfolding distributed_distr_eq_density[OF X]
    using D ae by (auto simp: AE_density)

  have int_eq: "(∫ x. f x * log b (f x) ∂MX) = (∫ x. log b (f x) ∂distr M MX X)"
    unfolding distributed_distr_eq_density[OF X]
    using D
    by (subst integral_density) (auto simp: nn)

  show ?eq
    unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal
    using ae_eq by (intro integral_cong_AE) (auto simp: nn)
qed

lemma (in information_space) entropy_le:
  fixes Px :: "'b ==> real" and MX :: "'b measure"
  assumes X[measurable]: "distributed M MX X Px" and Px_nn[simp]: "∧x. x ∈ space MX ==> 0 ≤ Px x"
  and fin: "emeasure MX {x ∈ space MX. Px x ≠ 0} ≠ top"
  and int: "integrable MX (λx. - Px x * log b (Px x))"
  shows "entropy b MX X ≤ log b (measure MX {x ∈ space MX. Px x ≠ 0})"
proof -
  note Px = distributed_borel_measurable[OF X]
  interpret X: prob_space "distr M MX X"
    using distributed_measurable[OF X] by (rule prob_space_distr)

  have " - log b (measure MX {x ∈ space MX. Px x ≠ 0}) =
    - log b (∫ x. indicator {x ∈ space MX. Px x ≠ 0} x ∂MX)"
    using Px Px_nn fin by (auto simp: measure_def)
  also have "- log b (∫ x. indicator {x ∈ space MX. Px x ≠ 0} x ∂MX) = - log b (∫ x. 1 / Px x ∂distr M MX X)"
  proof -
    have "integral🪙L MX (indicator {x ∈ space MX. Px x ≠ 0}) = LINT x|MX. Px x *🪙R (1 / Px x)"
      by (rule Bochner_Integration.integral_cong) auto
    also have "... = LINT x|density MX (λx. ennreal (Px x)). 1 / Px x"
      by (rule integral_density [symmetric]) (use Px Px_nn in auto)
    finally show ?thesis
      unfolding distributed_distr_eq_density[OF X] by simp
  qed
  also have "… ≤ (∫ x. - log b (1 / Px x) ∂distr M MX X)"
  proof (rule X.jensens_inequality[of "λx. 1 / Px x" "{0<..}" 0 1 "λx. - log b x"])
    show "AE x in distr M MX X. 1 / Px x ∈ {0<..}"
      unfolding distributed_distr_eq_density[OF X]
      using Px by (auto simp: AE_density)
    have [simp]: "∧x. x ∈ space MX ==> ennreal (if Px x = 0 then 0 else 1) = indicator {x ∈ space MX. Px x ≠ 0} x"
      by (auto simp: one_ennreal_def)
    have "(∫🪙+ x. ennreal (- (if Px x = 0 then 0 else 1)) ∂MX) = (∫🪙+ x. 0 ∂MX)"
      by (intro nn_integral_cong) (auto simp: ennreal_neg)
    then show "integrable (distr M MX X) (λx. 1 / Px x)"
      unfolding distributed_distr_eq_density[OF X] using Px
      by (auto simp: nn_integral_density real_integrable_def fin ennreal_neg ennreal_mult[symmetric]
              cong: nn_integral_cong)
    have "integrable MX (λx. Px x * log b (1 / Px x)) =
      integrable MX (λx. - Px x * log b (Px x))"
      using Px
      by (intro integrable_cong_AE) (auto simp: log_divide_pos log_recip)
    then show "integrable (distr M MX X) (λx. - log b (1 / Px x))"
      unfolding distributed_distr_eq_density[OF X]
      using Px int
      by (subst integrable_real_density) auto
  qed (auto simp: minus_log_convex[OF b_gt_1])
  also have "… = (∫ x. log b (Px x) ∂distr M MX X)"
    unfolding distributed_distr_eq_density[OF X] using Px
    by (intro integral_cong_AE) (auto simp: AE_density log_divide_pos)
  also have "… = - entropy b MX X"
    unfolding distributed_distr_eq_density[OF X] using Px
    by (subst entropy_distr[OF X]) (auto simp: integral_density)
  finally show ?thesis
    by simp
qed

lemma (in information_space) entropy_le_space:
  fixes Px :: "'b ==> real" and MX :: "'b measure"
  assumes X: "distributed M MX X Px" and Px_nn[simp]: "∧x. x ∈ space MX ==> 0 ≤ Px x"
  and fin: "finite_measure MX"
  and int: "integrable MX (λx. - Px x * log b (Px x))"
  shows "entropy b MX X ≤ log b (measure MX (space MX))"
proof -
  note Px = distributed_borel_measurable[OF X]
  interpret finite_measure MX by fact
  have "entropy b MX X ≤ log b (measure MX {x ∈ space MX. Px x ≠ 0})"
    using int X by (intro entropy_le) auto
  also have "… ≤ log b (measure MX (space MX))"
    using Px distributed_imp_emeasure_nonzero[OF X]
    by (intro Transcendental.log_mono)
       (auto intro!: finite_measure_mono b_gt_1 less_le[THEN iffD2]
             simp: emeasure_eq_measure cong: conj_cong)
  finally show ?thesis .
qed

lemma (in information_space) entropy_uniform:
  assumes X: "distributed M MX X (λx. indicator A x / measure MX A)" (is "distributed _ _ _ ?f")
  shows "entropy b MX X = log b (measure MX A)"
proof (subst entropy_distr[OF X])
  have [simp]: "emeasure MX A ≠ ∞"
    using uniform_distributed_params[OF X] by (auto simp add: measure_def)
  have eq: "(∫ x. indicator A x / measure MX A * log b (indicator A x / measure MX A) ∂MX) =
    (∫ x. (- log b (measure MX A) / measure MX A) * indicator A x ∂MX)"
    using uniform_distributed_params[OF X]
    by (intro Bochner_Integration.integral_cong) (auto split: split_indicator simp: log_divide_pos zero_less_measure_iff)
  show "- (∫ x. indicator A x / measure MX A * log b (indicator A x / measure MX A) ∂MX) =
    log b (measure MX A)"
    unfolding eq using uniform_distributed_params[OF X]
    by (subst Bochner_Integration.integral_mult_right) (auto simp: measure_def less_top[symmetric] intro!: integrable_real_indicator)
qed simp

lemma (in information_space) entropy_simple_distributed:
  "simple_distributed M X f ==> H(X) = - (∑x∈X`space M. f x * log b (f x))"
  by (subst entropy_distr[OF simple_distributed])
     (auto simp add: lebesgue_integral_count_space_finite)

lemma (in information_space) entropy_le_card_not_0:
  assumes X: "simple_distributed M X f"
  shows "H(X) ≤ log b (card (X ` space M ∩ {x. f x ≠ 0}))"
proof -
  let ?X = "count_space (X`space M)"
  have "H(X) ≤ log b (measure ?X {x ∈ space ?X. f x ≠ 0})"
    by (rule entropy_le[OF simple_distributed[OF X]])
       (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
  also have "measure ?X {x ∈ space ?X. f x ≠ 0} = card (X ` space M ∩ {x. f x ≠ 0})"
    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
  finally show ?thesis .
qed

lemma (in information_space) entropy_le_card:
  assumes X: "simple_distributed M X f"
  shows "H(X) ≤ log b (real (card (X ` space M)))"
proof -
  let ?X = "count_space (X`space M)"
  have "H(X) ≤ log b (measure ?X (space ?X))"
    by (rule entropy_le_space[OF simple_distributed[OF X]])
       (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
  also have "measure ?X (space ?X) = card (X ` space M)"
    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
  finally show ?thesis .
qed

subsection ‹Conditional Mutual Information›

definition (in prob_space)
  "conditional_mutual_information b MX MY MZ X Y Z ≡
    mutual_information b MX (MY ⨂🪙M MZ) X (λx. (Y x, Z x)) -
    mutual_information b MX MZ X Z"

abbreviation (in information_space)
  conditional_mutual_information_Pow (‹I'( _ ; _ | _ ')›) where
  "I(X ; Y | Z) ≡ conditional_mutual_information b
    (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"

lemma (in information_space)
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
  assumes Px[measurable]: "distributed M S X Px"
    and Px_nn[simp]: "∧x. x ∈ space S ==> 0 ≤ Px x"
  assumes Pz[measurable]: "distributed M P Z Pz"
    and Pz_nn[simp]: "∧z. z ∈ space P ==> 0 ≤ Pz z"
  assumes Pyz[measurable]: "distributed M (T ⨂🪙M P) (λx. (Y x, Z x)) Pyz"
    and Pyz_nn[simp]: "∧y z. y ∈ space T ==> z ∈ space P ==> 0 ≤ Pyz (y, z)"
  assumes Pxz[measurable]: "distributed M (S ⨂🪙M P) (λx. (X x, Z x)) Pxz"
    and Pxz_nn[simp]: "∧x z. x ∈ space S ==> z ∈ space P ==> 0 ≤ Pxz (x, z)"
  assumes Pxyz[measurable]: "distributed M (S ⨂🪙M T ⨂🪙M P) (λx. (X x, Y x, Z x)) Pxyz"
    and Pxyz_nn[simp]: "∧x y z. x ∈ space S ==> y ∈ space T ==> z ∈ space P ==> 0 ≤ Pxyz (x, y, z)"
  assumes I1: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
  assumes I2: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
  shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
    = (∫(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) ∂(S ⨂🪙M T ⨂🪙M P))" (is "?eq")
    and conditional_mutual_information_generic_nonneg: "0 ≤ conditional_mutual_information b S T P X Y Z" (is "?nonneg")
proof -
  have [measurable]: "Px ∈ S →🪙M borel"
    using Px Px_nn by (intro distributed_real_measurable)
  have [measurable]: "Pz ∈ P →🪙M borel"
    using Pz Pz_nn by (intro distributed_real_measurable)
  have measurable_Pyz[measurable]: "Pyz ∈ (T ⨂🪙M P) →🪙M borel"
    using Pyz Pyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
  have measurable_Pxz[measurable]: "Pxz ∈ (S ⨂🪙M P) →🪙M borel"
    using Pxz Pxz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)
  have measurable_Pxyz[measurable]: "Pxyz ∈ (S ⨂🪙M T ⨂🪙M P) →🪙M borel"
    using Pxyz Pxyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret P: sigma_finite_measure P by fact
  interpret TP: pair_sigma_finite T P ..
  interpret SP: pair_sigma_finite S P ..
  interpret ST: pair_sigma_finite S T ..
  interpret SPT: pair_sigma_finite "S ⨂🪙M P" T ..
  interpret STP: pair_sigma_finite S "T ⨂🪙M P" ..
  interpret TPS: pair_sigma_finite "T ⨂🪙M P" S ..
  have TP: "sigma_finite_measure (T ⨂🪙M P)" ..
  have SP: "sigma_finite_measure (S ⨂🪙M P)" ..
  have YZ: "random_variable (T ⨂🪙M P) (λx. (Y x, Z x))"
    using Pyz by (simp add: distributed_measurable)

  from Pxz Pxyz have distr_eq: "distr M (S ⨂🪙M P) (λx. (X x, Z x)) =
    distr (distr M (S ⨂🪙M T ⨂🪙M P) (λx. (X x, Y x, Z x))) (S ⨂🪙M P) (λ(x, y, z). (x, z))"
    by (simp add: comp_def distr_distr)

  have "mutual_information b S P X Z =
    (∫x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) ∂(S ⨂🪙M P))"
    by (rule mutual_information_distr[OF S P Px Px_nn Pz Pz_nn Pxz Pxz_nn])
  also have "… = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂🪙M T ⨂🪙M P))"
    using b_gt_1 Pxz Px Pz
    by (subst distributed_transform_integral[OF Pxyz _ Pxz _, where T="λ(x, y, z). (x, z)"])
       (auto simp: split_beta' space_pair_measure)
  finally have mi_eq:
    "mutual_information b S P X Z = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂🪙M T ⨂🪙M P))" .

  have ae1: "AE x in S ⨂🪙M T ⨂🪙M P. Px (fst x) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_real[of fst, OF _ Pxyz Px]) (auto simp: space_pair_measure)
  moreover have ae2: "AE x in S ⨂🪙M T ⨂🪙M P. Pz (snd (snd x)) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_real[of "λx. snd (snd x)", OF _ Pxyz Pz]) (auto simp: space_pair_measure)
  moreover have ae3: "AE x in S ⨂🪙M T ⨂🪙M P. Pxz (fst x, snd (snd x)) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_real[of "λx. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto simp: space_pair_measure)
  moreover have ae4: "AE x in S ⨂🪙M T ⨂🪙M P. Pyz (snd x) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto simp: space_pair_measure)
  ultimately have ae: "AE x in S ⨂🪙M T ⨂🪙M P.
    Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
    Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
    Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
    using AE_space
  proof eventually_elim
    case (elim x)
    show ?case
    proof cases
      assume "Pxyz x ≠ 0"
      with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
        "0 < Pyz (snd x)" "0 < Pxyz x"
        by (auto simp: space_pair_measure less_le)
      then show ?thesis
        using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
    qed simp
  qed
  with I1 I2 show ?eq
    unfolding conditional_mutual_information_def
    apply (subst mi_eq)
    apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz _ Pxyz])
    apply (auto simp: space_pair_measure)
    apply (subst Bochner_Integration.integral_diff[symmetric])
    apply (auto intro!: integral_cong_AE simp: split_beta' simp del: Bochner_Integration.integral_diff)
    done

  let ?P = "density (S ⨂🪙M T ⨂🪙M P) Pxyz"
  interpret P: prob_space ?P
    unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
    by (rule prob_space_distr) simp

  let ?Q = "density (T ⨂🪙M P) Pyz"
  interpret Q: prob_space ?Q
    unfolding distributed_distr_eq_density[OF Pyz, symmetric]
    by (rule prob_space_distr) simp

  let ?f = "λ(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"

  from subdensity_real[of snd, OF _ Pyz Pz _ AE_I2 AE_I2]
  have aeX1: "AE x in T ⨂🪙M P. Pz (snd x) = 0 ⟶ Pyz x = 0"
    by (auto simp: comp_def space_pair_measure)
  have aeX2: "AE x in T ⨂🪙M P. 0 ≤ Pz (snd x)"
    using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def)

  have aeX3: "AE y in T ⨂🪙M P. (∫🪙+ x. ennreal (Pxz (x, snd y)) ∂S) = ennreal (Pz (snd y))"
    using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
    by (intro TP.AE_pair_measure) auto

  have "(∫🪙+ x. ?f x ∂?P) ≤ (∫🪙+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) ∂(S ⨂??M T ⨂🪙M P))"
    by (subst nn_integral_density)
       (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric])
  also have "… = (∫🪙+(y, z). (∫🪙+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) ∂S) ∂(T ⨂🪙M P))"
    by (subst STP.nn_integral_snd[symmetric])
       (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong)
  also have "… = (∫🪙+x. ennreal (Pyz x) * 1 ∂T ⨂🪙M P)"
  proof -
    have D: "(∫🪙+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) ∂S) = ennreal (Pyz (a, b))"
      if "Pz b = 0 ⟶ Pyz (a, b) = 0" "a ∈ space T ∧ b ∈ space P"
        "(∫🪙+ x. ennreal (Pxz (x, b)) ∂S) = ennreal (Pz b)"
      for a b
      using that by (subst nn_integral_multc) (auto split: prod.split simp: ennreal_mult[symmetric])
    show ?thesis
      apply (rule nn_integral_cong_AE)
      using aeX1 aeX3
      by (force simp add: space_pair_measure D)
  qed
  also have "… = 1"
    using Q.emeasure_space_1 distributed_distr_eq_density[OF Pyz]
    by (subst nn_integral_density[symmetric]) auto
  finally have le1: "(∫🪙+ x. ?f x ∂?P) ≤ 1" .
  also have "… < ∞" by simp
  finally have fin: "(∫🪙+ x. ?f x ∂?P) ≠ ∞" by simp

  have "(∫🪙+ x. ennreal (Pxyz x) *
               ennreal (Pxz (fst x, snd (snd x)) * Pyz (snd x) / (Pz (snd (snd x)) * Pxyz x))
               ∂(S ⨂🪙M T ⨂🪙M P)) ≠ 0"
  proof
    let ?g = "λx. ennreal (Pxyz x) * (Pxz (fst x, snd (snd x)) * Pyz (snd x) / (Pz (snd (snd x)) * Pxyz x))"
    assume "(∫🪙+x. ?g x ∂(S ⨂🪙M T ⨂🪙M P)) = 0"
    then have "AE x in S ⨂🪙M T ⨂🪙M P. ?g x = 0"
      by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
    then have "AE x in S ⨂🪙M T ⨂🪙M P. Pxyz x = 0"
      using ae2 ae3 ae4
      by (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure)
    then have "(∫🪙+ x. ennreal (Pxyz x) ∂S ⨂🪙M T ⨂🪙M P) = 0"
      by (subst nn_integral_cong_AE[of _ "λx. 0"]) auto
    with P.emeasure_space_1 show False
      by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  qed
  then have pos: "(∫🪙+x. ?f x ∂?P) ≠ 0"
    by (subst nn_integral_density) (simp_all add: split_beta')

  have neg: "(∫🪙+ x. - ?f x ∂?P) = 0"
    by (subst nn_integral_0_iff_AE) (auto simp: space_pair_measure ennreal_neg)

  have I3: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
    apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ Bochner_Integration.integrable_diff[OF I1 I2]])
    using ae
    apply (auto simp: split_beta')
    done

  have "- log b 1 ≤ - log b (integral🪙L ?P ?f)"
  proof (intro le_imp_neg_le Transcendental.log_mono[OF b_gt_1])
    have If: "integrable ?P ?f"
      unfolding real_integrable_def
    proof (intro conjI)
      from neg show "(∫🪙+ x. - ?f x ∂?P) ≠ ∞"
        by simp
      from fin show "(∫🪙+ x. ?f x ∂?P) ≠ ∞"
        by simp
    qed simp
    then have "(∫🪙+ x. ?f x ∂?P) = (∫x. ?f x ∂?P)"
      apply (rule nn_integral_eq_integral)
      apply (auto simp: space_pair_measure ennreal_neg)
      done
    with pos le1
    show "0 < (∫x. ?f x ∂?P)" "(∫x. ?f x ∂?P) ≤ 1"
      by (simp_all add: one_ennreal.rep_eq zero_less_iff_neq_zero[symmetric])
  qed
  also have "- log b (integral🪙L ?P ?f) ≤ (∫ x. - log b (?f x) ∂?P)"
  proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
    show "AE x in ?P. ?f x ∈ {0<..}"
      unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
      using ae1 ae2 ae3 ae4
      by (auto simp: space_pair_measure less_le)
    show "integrable ?P ?f"
      unfolding real_integrable_def
      using fin neg by (auto simp: split_beta')
    have "integrable (S ⨂🪙M T ⨂🪙M P) (λx. Pxyz x * - log b (?f x))"
      apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
      by (auto simp: log_mult log_divide field_simps)
    then
    show "integrable ?P (λx. - log b (?f x))"
      by (subst integrable_real_density) (auto simp: space_pair_measure)
  qed (auto simp: b_gt_1 minus_log_convex)
  also have "… = conditional_mutual_information b S T P X Y Z"
    unfolding ‹?eq›
    apply (subst integral_real_density)
       apply simp
      apply (force simp: space_pair_measure)
     apply simp
    apply (intro integral_cong_AE)
    by (auto simp: field_simps log_divide)
  finally show ?nonneg
    by simp
qed

lemma (in information_space)
  fixes Px :: "_ ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
  assumes Fx: "finite_entropy S X Px"
  assumes Fz: "finite_entropy P Z Pz"
  assumes Fyz: "finite_entropy (T ⨂🪙M P) (λx. (Y x, Z x)) Pyz"
  assumes Fxz: "finite_entropy (S ⨂🪙M P) (λx. (X x, Z x)) Pxz"
  assumes Fxyz: "finite_entropy (S ⨂🪙M T ⨂🪙M P) (λx. (X x, Y x, Z x)) Pxyz"
  shows conditional_mutual_information_generic_eq': "conditional_mutual_information b S T P X Y Z
    = (∫(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) ∂(S ⨂🪙M T ⨂🪙M P))" (is "?eq")
    and conditional_mutual_information_generic_nonneg': "0 ≤ conditional_mutual_information b S T P X Y Z" (is "?nonneg")
proof -
  note Px = Fx[THEN finite_entropy_distributed, measurable]
  note Pz = Fz[THEN finite_entropy_distributed, measurable]
  note Pyz = Fyz[THEN finite_entropy_distributed, measurable]
  note Pxz = Fxz[THEN finite_entropy_distributed, measurable]
  note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable]

  note Px_nn = Fx[THEN finite_entropy_nn]
  note Pz_nn = Fz[THEN finite_entropy_nn]
  note Pyz_nn = Fyz[THEN finite_entropy_nn]
  note Pxz_nn = Fxz[THEN finite_entropy_nn]
  note Pxyz_nn = Fxyz[THEN finite_entropy_nn]

  note Px' = Fx[THEN finite_entropy_measurable, measurable]
  note Pz' = Fz[THEN finite_entropy_measurable, measurable]
  note Pyz' = Fyz[THEN finite_entropy_measurable, measurable]
  note Pxz' = Fxz[THEN finite_entropy_measurable, measurable]
  note Pxyz' = Fxyz[THEN finite_entropy_measurable, measurable]

  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret P: sigma_finite_measure P by fact
  interpret TP: pair_sigma_finite T P ..
  interpret SP: pair_sigma_finite S P ..
  interpret ST: pair_sigma_finite S T ..
  interpret SPT: pair_sigma_finite "S ⨂🪙M P" T ..
  interpret STP: pair_sigma_finite S "T ⨂🪙M P" ..
  interpret TPS: pair_sigma_finite "T ⨂🪙M P" S ..
  have TP: "sigma_finite_measure (T ⨂🪙M P)" ..
  have SP: "sigma_finite_measure (S ⨂🪙M P)" ..

  from Pxz Pxyz have distr_eq: "distr M (S ⨂🪙M P) (λx. (X x, Z x)) =
    distr (distr M (S ⨂🪙M T ⨂🪙M P) (λx. (X x, Y x, Z x))) (S ⨂🪙M P) (λ(x, y, z). (x, z))"
    by (simp add: distr_distr comp_def)

  have "mutual_information b S P X Z =
    (∫x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) ∂(S ⨂🪙M P))"
    using Px Px_nn Pz Pz_nn Pxz Pxz_nn
    by (rule mutual_information_distr[OF S P]) (auto simp: space_pair_measure)
  also have "… = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂🪙M T ⨂🪙M P))"
    using b_gt_1 Pxz Pxz_nn Pxyz Pxyz_nn
    by (subst distributed_transform_integral[OF Pxyz _ Pxz, where T="λ(x, y, z). (x, z)"])
       (auto simp: split_beta')
  finally have mi_eq:
    "mutual_information b S P X Z = (∫(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) ∂(S ⨂🪙M T ⨂🪙M P))" .

  have ae1: "AE x in S ⨂🪙M T ⨂🪙M P. Px (fst x) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_finite_entropy[of fst, OF _ Fxyz Fx]) auto
  moreover have ae2: "AE x in S ⨂🪙M T ⨂🪙M P. Pz (snd (snd x)) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_finite_entropy[of "λx. snd (snd x)", OF _ Fxyz Fz]) auto
  moreover have ae3: "AE x in S ⨂🪙M T ⨂🪙M P. Pxz (fst x, snd (snd x)) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_finite_entropy[of "λx. (fst x, snd (snd x))", OF _ Fxyz Fxz]) auto
  moreover have ae4: "AE x in S ⨂🪙M T ⨂🪙M P. Pyz (snd x) = 0 ⟶ Pxyz x = 0"
    by (intro subdensity_finite_entropy[of snd, OF _ Fxyz Fyz]) auto
  ultimately have ae: "AE x in S ⨂🪙M T ⨂🪙M P.
    Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
    Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
    Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
    using AE_space
  proof eventually_elim
    case (elim x)
    show ?case
    proof cases
      assume "Pxyz x ≠ 0"
      with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))"
        "0 < Pyz (snd x)" "0 < Pxyz x"
        using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
        by (auto simp: space_pair_measure less_le)
      then show ?thesis
        using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
    qed simp
  qed

  have "integrable (S ⨂🪙M T ⨂🪙M P)
    (λx. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))"
    using finite_entropy_integrable[OF Fxyz]
    using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst]
    using finite_entropy_integrable_transform[OF Fyz Pxyz Pxyz_nn, of snd]
    by simp
  moreover have "(λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) ∈ borel_measurable (S ⨂🪙M T ⨂🪙M P)"
    using Pxyz Px Pyz by simp
  ultimately have I1: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
    apply (rule integrable_cong_AE_imp)
    using ae1 ae4 Px_nn Pyz_nn Pxyz_nn
    by (auto simp: log_divide log_mult field_simps)
  have "integrable (S ⨂🪙M T ⨂🪙M P)
    (λx. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))"
    using finite_entropy_integrable_transform[OF Fxz Pxyz Pxyz_nn, of "λx. (fst x, snd (snd x))"]
    using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst]
    using finite_entropy_integrable_transform[OF Fz Pxyz Pxyz_nn, of "snd ∘ snd"]
    by simp
  moreover have "(λ(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) ∈ borel_measurable (S ⨂🪙M T ⨂🪙M P)"
    using Pxyz Px Pz
    by auto
  ultimately have I2: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
    apply (rule integrable_cong_AE_imp)
    using ae1 ae2 ae3 ae4  Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
    apply(auto simp:  field_simps log_divide log_mult)
    done
  from ae I1 I2 show ?eq
    unfolding conditional_mutual_information_def mi_eq
    apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz Pyz_nn Pxyz Pxyz_nn]; simp add: space_pair_measure)
    apply (subst Bochner_Integration.integral_diff[symmetric])
      apply (auto intro!: integral_cong_AE simp: split_beta' simp del: Bochner_Integration.integral_diff)
    done

  let ?P = "density (S ⨂🪙M T ⨂🪙M P) Pxyz"
  interpret P: prob_space ?P
    unfolding distributed_distr_eq_density[OF Pxyz, symmetric] by (rule prob_space_distr) simp

  let ?Q = "density (T ⨂🪙M P) Pyz"
  interpret Q: prob_space ?Q
    unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp

  let ?f = "λ(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"

  from subdensity_finite_entropy[of snd, OF _ Fyz Fz]
  have aeX1: "AE x in T ⨂🪙M P. Pz (snd x) = 0 ⟶ Pyz x = 0" by (auto simp: comp_def)
  have aeX2: "AE x in T ⨂🪙M P. 0 ≤ Pz (snd x)"
    using Pz by (intro TP.AE_pair_measure) (auto intro: Pz_nn)

  have aeX3: "AE y in T ⨂🪙M P. (∫🪙+ x. ennreal (Pxz (x, snd y)) ∂S) = ennreal (Pz (snd y))"
    using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz]
    by (intro TP.AE_pair_measure) (auto )
  have "(∫🪙+ x. ?f x ∂?P) ≤ (∫🪙+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) ∂(S ⨂??M T ⨂🪙M P))"
    using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
    by (subst nn_integral_density)
       (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric])
  also have "… = (∫🪙+(y, z). ∫🪙+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) ∂S ∂T ⨂🪙M P)"
    using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn
    by (subst STP.nn_integral_snd[symmetric])
       (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong)
  also have "… = (∫🪙+x. ennreal (Pyz x) * 1 ∂T ⨂🪙M P)"
  proof -
    have *: "(∫🪙+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) ∂S) = ennreal (Pyz (a, b))"
      if "Pz b = 0 ⟶ Pyz (a, b) = 0" "0 ≤ Pz b" "a ∈ space T ∧ b ∈ space P"
        "(∫🪙+ x. ennreal (Pxz (x, b)) ∂S) = ennreal (Pz b)" for a b
      using Pyz_nn[of "(a,b)"] that
      by (subst nn_integral_multc) (auto simp: space_pair_measure ennreal_mult[symmetric])
    show ?thesis
      using aeX1 aeX2 aeX3 AE_space
      by (force simp: * space_pair_measure intro: nn_integral_cong_AE)
  qed
  also have "… = 1"
    using Q.emeasure_space_1 Pyz_nn distributed_distr_eq_density[OF Pyz]
    by (subst nn_integral_density[symmetric]) auto
  finally have le1: "(∫🪙+ x. ?f x ∂?P) ≤ 1" .
  also have "… < ∞" by simp
  finally have fin: "(∫🪙+ x. ?f x ∂?P) ≠ ∞" by simp

  have "(∫🪙+ x. ?f x ∂?P) ≠ 0"
    using Pxyz_nn
    apply (subst nn_integral_density)
    apply (simp_all add: split_beta'  ennreal_mult'[symmetric] cong: nn_integral_cong)
  proof
    let ?g = "λx. ennreal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
    assume "(∫🪙+ x. ?g x ∂(S ⨂🪙M T ⨂🪙M P)) = 0"
    then have "AE x in S ⨂🪙M T ⨂🪙M P. ?g x = 0"
      by (intro nn_integral_0_iff_AE[THEN iffD1]) auto
    then have "AE x in S ⨂🪙M T ⨂🪙M P. Pxyz x = 0"
      using ae1 ae2 ae3 ae4
      by  (insert Px_nn Pz_nn Pxz_nn Pyz_nn,
           auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure)
    then have "(∫🪙+ x. ennreal (Pxyz x) ∂S ⨂🪙M T ⨂🪙M P) = 0"
      by (subst nn_integral_cong_AE[of _ "λx. 0"]) auto
    with P.emeasure_space_1 show False
      by (subst (asm) emeasure_density) (auto cong: nn_integral_cong)
  qed
  then have pos: "0 < (∫🪙+ x. ?f x ∂?P)"
    by (simp add: zero_less_iff_neq_zero)

  have neg: "(∫🪙+ x. - ?f x ∂?P) = 0"
    using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
    by (intro nn_integral_0_iff_AE[THEN iffD2])
       (auto simp: split_beta' AE_density space_pair_measure intro!: AE_I2 ennreal_neg)

  have I3: "integrable (S ⨂🪙M T ⨂🪙M P) (λ(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
    apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ Bochner_Integration.integrable_diff[OF I1 I2]])
    using ae
    by (auto simp: split_beta')

  have "- log b 1 ≤ - log b (integral🪙L ?P ?f)"
  proof (intro le_imp_neg_le Transcendental.log_mono[OF b_gt_1])
    have If: "integrable ?P ?f"
      using neg fin by (force simp add: real_integrable_def)
    then have "(∫🪙+ x. ?f x ∂?P) = (∫x. ?f x ∂?P)"
      using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
      by (intro nn_integral_eq_integral)
         (auto simp: AE_density space_pair_measure)
    with pos le1
    show "0 < (∫x. ?f x ∂?P)" "(∫x. ?f x ∂?P) ≤ 1"
      by (simp_all add: )
  qed
  also have "- log b (integral🪙L ?P ?f) ≤ (∫ x. - log b (?f x) ∂?P)"
  proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
    show "AE x in ?P. ?f x ∈ {0<..}"
      unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
      using ae1 ae2 ae3 ae4
      by (insert Pxyz_nn Pyz_nn Pz_nn Pxz_nn, auto simp: space_pair_measure less_le)
    show "integrable ?P ?f"
      unfolding real_integrable_def
      using fin neg by (auto simp: split_beta')
    have "integrable (S ⨂🪙M T ⨂🪙M P) (λx. Pxyz x * - log b (?f x))"
      apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
      using Pz_nn Pxz_nn Pyz_nn Pxyz_nn ae2 ae3 ae4
      by (auto simp: log_divide field_simps)
    then show "integrable ?P (λx. - log b (?f x))"
      using Pxyz_nn by (auto simp: integrable_real_density)
  qed (auto simp: b_gt_1 minus_log_convex)
  also have "… = conditional_mutual_information b S T P X Y Z"
    unfolding ‹?eq›
    using Pz_nn Pxz_nn Pyz_nn Pxyz_nn
    apply (subst integral_real_density)
    apply simp
    apply simp
    apply simp
    apply (intro integral_cong_AE)
    using ae1 ae2 ae3 ae4
    by (auto simp: log_divide zero_less_mult_iff field_simps)
  finally show ?nonneg
    by simp
qed

lemma (in information_space) conditional_mutual_information_eq:
  assumes Pz: "simple_distributed M Z Pz"
  assumes Pyz: "simple_distributed M (λx. (Y x, Z x)) Pyz"
  assumes Pxz: "simple_distributed M (λx. (X x, Z x)) Pxz"
  assumes Pxyz: "simple_distributed M (λx. (X x, Y x, Z x)) Pxyz"
  shows "I(X ; Y | Z) =
   (∑(x, y, z)∈(λx. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _ _
    simple_distributed[OF Pz] _ simple_distributed_joint[OF Pyz] _ simple_distributed_joint[OF Pxz] _
    simple_distributed_joint2[OF Pxyz]])
  note simple_distributed_joint2_finite[OF Pxyz, simp]
  show "sigma_finite_measure (count_space (X ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  show "sigma_finite_measure (count_space (Y ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  show "sigma_finite_measure (count_space (Z ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  have "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) ⨂🪙M count_space (Z ` space M) =
      count_space (X`space M × Y`space M × Z`space M)"
    (is "?P = ?C")
    by (simp add: pair_measure_count_space)

  let ?Px = "λx. measure M (X -` {x} ∩ space M)"
  have "(λx. (X x, Z x)) ∈ measurable M (count_space (X ` space M) ⨂🪙M count_space (Z ` space M))"
    using simple_distributed_joint[OF Pxz] by (rule distributed_measurable)
  from measurable_comp[OF this measurable_fst]
  have "random_variable (count_space (X ` space M)) X"
    by (simp add: comp_def)
  then have "simple_function M X"
    unfolding simple_function_def by (auto simp: measurable_count_space_eq2)
  then have "simple_distributed M X ?Px"
    by (rule simple_distributedI) (auto simp: measure_nonneg)
  then show "distributed M (count_space (X ` space M)) X ?Px"
    by (rule simple_distributed)

  let ?f = "(λx. if x ∈ (λx. (X x, Y x, Z x)) ` space M then Pxyz x else 0)"
  let ?g = "(λx. if x ∈ (λx. (Y x, Z x)) ` space M then Pyz x else 0)"
  let ?h = "(λx. if x ∈ (λx. (X x, Z x)) ` space M then Pxz x else 0)"
  show
      "integrable ?P (λ(x, y, z). ?f (x, y, z) * log b (?f (x, y, z) / (?Px x * ?g (y, z))))"
      "integrable ?P (λ(x, y, z). ?f (x, y, z) * log b (?h (x, z) / (?Px x * Pz z)))"
    by (auto intro!: integrable_count_space simp: pair_measure_count_space)
  let ?i = "λx y z. ?f (x, y, z) * log b (?f (x, y, z) / (?h (x, z) * (?g (y, z) / Pz z)))"
  let ?j = "λx y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))"
  have "(λ(x, y, z). ?i x y z) = (λx. if x ∈ (λx. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)"
    by (auto intro!: ext)
  then show "(∫ (x, y, z). ?i x y z ∂?P) = (∑(x, y, z)∈(λx. (X x, Y x, Z x)) ` space M. ?j x y z)"
    by (auto intro!: sum.cong simp add: ‹?P = ?C› lebesgue_integral_count_space_finite simple_distributed_finite sum.If_cases split_beta')
qed (insert Pz Pyz Pxz Pxyz, auto intro: measure_nonneg)

lemma (in information_space) conditional_mutual_information_nonneg:
  assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
  shows "0 ≤ I(X ; Y | Z)"
proof -
  have [simp]: "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) ⨂🪙M count_space (Z ` space M) =
      count_space (X`space M × Y`space M × Z`space M)"
    by (simp add: pair_measure_count_space X Y Z simple_functionD)
  note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
  note sd = simple_distributedI[OF _ _ refl]
  note sp = simple_function_Pair
  show ?thesis
    apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
               apply (force intro: simple_distributed[OF sd[OF X]])
              apply simp
             apply (force intro: simple_distributed[OF sd[OF Z]])
            apply simp
           apply (force intro: simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
          apply simp
         apply (force intro: simple_distributed_joint[OF sd[OF sp[OF X Z]]])
        apply simp
       apply (fastforce intro:  simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
      apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
    done
qed

subsection ‹Conditional Entropy›

definition (in prob_space)
  "conditional_entropy b S T X Y = - (∫(x, y). log b (enn2real (RN_deriv (S ⨂🪙M T) (distr M (S ⨂🪙M T) (λx. (X x, Y x))) (x, y)) /
    enn2real (RN_deriv T (distr M T Y) y)) ∂distr M (S ⨂🪙M T) (λx. (X x, Y x)))"

abbreviation (in information_space)
  conditional_entropy_Pow (‹H'(_ | _')›) where
  "H(X | Y) ≡ conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y"

lemma (in information_space) conditional_entropy_generic_eq:
  fixes Pxy :: "_ ==> real" and Py :: "'c ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Py[measurable]: "distributed M T Y Py" and Py_nn[simp]: "∧x. x ∈ space T ==> 0 ≤ Py x"
  assumes Pxy[measurable]: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn[simp]: "∧x y. x ∈ space S ==> y ∈ space T ==> 0 ≤ Pxy (x, y)"
  shows "conditional_entropy b S T X Y = - (∫(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) ∂(S ⨂🪙M T))"
proof -
  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..

  have [measurable]: "Py ∈ T →🪙M borel"
    using Py Py_nn by (intro distributed_real_measurable)
  have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂🪙M T) →🪙M borel"
    using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  have "AE x in density (S ⨂🪙M T) (λx. ennreal (Pxy x)). Pxy x = enn2real (RN_deriv (S ⨂🪙M T) (distr M (S ⨂🪙M T) (λx. (X x, Y x))) x)"
    unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
    unfolding distributed_distr_eq_density[OF Pxy]
    using distributed_RN_deriv[OF Pxy]
    by auto
  moreover
  have "AE x in density (S ⨂🪙M T) (λx. ennreal (Pxy x)). Py (snd x) = enn2real (RN_deriv T (distr M T Y) (snd x))"
    unfolding AE_density[OF distributed_borel_measurable, OF Pxy]
    unfolding distributed_distr_eq_density[OF Py]
    using distributed_RN_deriv[OF Py]
    by (force intro: ST.AE_pair_measure)
  ultimately
  have "conditional_entropy b S T X Y = - (∫x. Pxy x * log b (Pxy x / Py (snd x)) ∂(S ⨂🪙M T))"
    unfolding conditional_entropy_def neg_equal_iff_equal
    apply (subst integral_real_density[symmetric])
    apply (auto simp: distributed_distr_eq_density[OF Pxy] space_pair_measure
                intro!: integral_cong_AE)
    done
  then show ?thesis by (simp add: split_beta')
qed

lemma (in information_space) conditional_entropy_eq_entropy:
  fixes Px :: "'b ==> real" and Py :: "'c ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Py[measurable]: "distributed M T Y Py"
    and Py_nn[simp]: "∧x. x ∈ space T ==> 0 ≤ Py x"
  assumes Pxy[measurable]: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn[simp]: "∧x y. x ∈ space S ==> y ∈ space T ==> 0 ≤ Pxy (x, y)"
  assumes I1: "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Pxy x))"
  assumes I2: "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Py (snd x)))"
  shows "conditional_entropy b S T X Y = entropy b (S ⨂🪙M T) (λx. (X x, Y x)) - entropy b T Y"
proof -
  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..

  have [measurable]: "Py ∈ T →🪙M borel"
    using Py Py_nn by (intro distributed_real_measurable)
  have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂🪙M T) →🪙M borel"
    using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  have "entropy b T Y = - (∫y. Py y * log b (Py y) ∂T)"
    by (rule entropy_distr[OF Py Py_nn])
  also have "… = - (∫(x,y). Pxy (x,y) * log b (Py y) ∂(S ⨂🪙M T))"
    using b_gt_1
    by (subst distributed_transform_integral[OF Pxy _ Py, where T=snd])
       (auto intro!: Bochner_Integration.integral_cong simp: space_pair_measure)
  finally have e_eq: "entropy b T Y = - (∫(x,y). Pxy (x,y) * log b (Py y) ∂(S ⨂🪙M T))" .

  have **: "∧x. x ∈ space (S ⨂🪙M T) ==> 0 ≤ Pxy x"
    by (auto simp: space_pair_measure)

  have ae2: "AE x in S ⨂🪙M T. Py (snd x) = 0 ⟶ Pxy x = 0"
    by (intro subdensity_real[of snd, OF _ Pxy Py])
       (auto intro: measurable_Pair simp: space_pair_measure)
  moreover have ae4: "AE x in S ⨂🪙M T. 0 ≤ Py (snd x)"
    using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'')
  ultimately have "AE x in S ⨂🪙M T. 0 ≤ Pxy x ∧ 0 ≤ Py (snd x) ∧
    (Pxy x = 0 ∨ (Pxy x ≠ 0 ⟶ 0 < Pxy x ∧ 0 < Py (snd x)))"
    by (auto simp: space_pair_measure less_le)
  then have ae: "AE x in S ⨂🪙M T.
     Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))"
    by (auto simp: log_simps field_simps b_gt_1)
  have "conditional_entropy b S T X Y =
    - (∫x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) ∂(S ⨂🪙M T))"
    unfolding conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified] neg_equal_iff_equal
    using ae by (force intro: integral_cong_AE)
  also have "… = - (∫x. Pxy x * log b (Pxy x) ∂(S ⨂🪙M T)) - - (∫x. Pxy x * log b (Py (snd x)) ∂(S ⨂🪙M T))"
    by (simp add: Bochner_Integration.integral_diff[OF I1 I2])
  finally show ?thesis
    using conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified]
      entropy_distr[OF Pxy **, simplified] e_eq
    by (simp add: split_beta')
qed

lemma (in information_space) conditional_entropy_eq_entropy_simple:
  assumes X: "simple_function M X" and Y: "simple_function M Y"
  shows "H(X | Y) = entropy b (count_space (X`space M) ⨂🪙M count_space (Y`space M)) (λx. (X x, Y x)) - H(Y)"
proof -
  have "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) = count_space (X`space M × Y`space M)"
    (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space)
  show ?thesis
    by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite
             simple_functionD  X Y simple_distributed simple_distributedI[OF _ _ refl]
             simple_distributed_joint simple_function_Pair integrable_count_space measure_nonneg)+
       (auto simp: ‹?P = ?C› measure_nonneg intro!: integrable_count_space simple_functionD  X Y)
qed

lemma (in information_space) conditional_entropy_eq:
  assumes Y: "simple_distributed M Y Py"
  assumes XY: "simple_distributed M (λx. (X x, Y x)) Pxy"
    shows "H(X | Y) = - (∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
proof (subst conditional_entropy_generic_eq[OF _ _
  simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
  have "finite ((λx. (X x, Y x))`space M)"
    using XY unfolding simple_distributed_def by auto
  from finite_imageI[OF this, of fst]
  have [simp]: "finite (X`space M)"
    by (simp add: image_comp comp_def)
  note Y[THEN simple_distributed_finite, simp]
  show "sigma_finite_measure (count_space (X ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  show "sigma_finite_measure (count_space (Y ` space M))"
    by (simp add: sigma_finite_measure_count_space_finite)
  let ?f = "(λx. if x ∈ (λx. (X x, Y x)) ` space M then Pxy x else 0)"
  have "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) = count_space (X`space M × Y`space M)"
    (is "?P = ?C")
    using Y by (simp add: simple_distributed_finite pair_measure_count_space)
  have eq: "(λ(x, y). ?f (x, y) * log b (?f (x, y) / Py y)) =
    (λx. if x ∈ (λx. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)"
    by auto
  from Y show "- (∫ (x, y). ?f (x, y) * log b (?f (x, y) / Py y) ∂?P) =
    - (∑(x, y)∈(λx. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))"
    by (auto intro!: sum.cong simp add: ‹?P = ?C› lebesgue_integral_count_space_finite simple_distributed_finite eq sum.If_cases split_beta')
qed (use Y XY in auto)

lemma (in information_space) conditional_mutual_information_eq_conditional_entropy:
  assumes X: "simple_function M X" and Y: "simple_function M Y"
  shows "I(X ; X | Y) = H(X | Y)"
proof -
  define Py where "Py x = (if x ∈ Y`space M then measure M (Y -` {x} ∩ space M) else 0)" for x
  define Pxy where "Pxy x =
      (if x ∈ (λx. (X x, Y x))`space M then measure M ((λx. (X x, Y x)) -` {x} ∩ space M) else 0)"
    for x
  define Pxxy where "Pxxy x =
      (if x ∈ (λx. (X x, X x, Y x))`space M then measure M ((λx. (X x, X x, Y x)) -` {x} ∩ space M)
       else 0)"
    for x
  let ?M = "X`space M × X`space M × Y`space M"

  note XY = simple_function_Pair[OF X Y]
  note XXY = simple_function_Pair[OF X XY]
  have Py: "simple_distributed M Y Py"
    using Y by (rule simple_distributedI) (auto simp: Py_def measure_nonneg)
  have Pxy: "simple_distributed M (λx. (X x, Y x)) Pxy"
    using XY by (rule simple_distributedI) (auto simp: Pxy_def measure_nonneg)
  have Pxxy: "simple_distributed M (λx. (X x, X x, Y x)) Pxxy"
    using XXY by (rule simple_distributedI) (auto simp: Pxxy_def measure_nonneg)
  have eq: "(λx. (X x, X x, Y x)) ` space M = (λ(x, y). (x, x, y)) ` (λx. (X x, Y x)) ` space M"
    by auto
  have inj: "∧A. inj_on (λ(x, y). (x, x, y)) A"
    by (auto simp: inj_on_def)
  have Pxxy_eq: "∧x y. Pxxy (x, x, y) = Pxy (x, y)"
    by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob])
  have "AE x in count_space ((λx. (X x, Y x))`space M). Py (snd x) = 0 ⟶ Pxy x = 0"
    using Py Pxy
    by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]])
       (auto intro: measurable_Pair simp: AE_count_space)
  then show ?thesis
    apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy])
    apply (subst conditional_entropy_eq[OF Py Pxy])
    apply (auto intro!: sum.cong simp: Pxxy_eq sum_negf[symmetric] eq sum.reindex[OF inj]
                log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space)
    done
qed

lemma (in information_space) conditional_entropy_nonneg:
  assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 ≤ H(X | Y)"
  using conditional_mutual_information_eq_conditional_entropy[OF X Y] conditional_mutual_information_nonneg[OF X X Y]
  by simp

subsection ‹Equalities›

lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr:
  fixes Px :: "'b ==> real" and Py :: "'c ==> real" and Pxy :: "('b × 'c) ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Px[measurable]: "distributed M S X Px"
    and Px_nn[simp]: "∧x. x ∈ space S ==> 0 ≤ Px x"
    and Py[measurable]: "distributed M T Y Py"
    and Py_nn[simp]: "∧x. x ∈ space T ==> 0 ≤ Py x"
    and Pxy[measurable]: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    and Pxy_nn[simp]: "∧x y. x ∈ space S ==> y ∈ space T ==> 0 ≤ Pxy (x, y)"
  assumes Ix: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Px (fst x)))"
  assumes Iy: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Py (snd x)))"
  assumes Ixy: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Pxy x))"
  shows  "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S ⨂🪙M T) (λx. (X x, Y x))"
proof -
  have [measurable]: "Px ∈ S →🪙M borel"
    using Px Px_nn by (intro distributed_real_measurable)
  have [measurable]: "Py ∈ T →🪙M borel"
    using Py Py_nn by (intro distributed_real_measurable)
  have measurable_Pxy[measurable]: "Pxy ∈ (S ⨂🪙M T) →🪙M borel"
    using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure)

  have X: "entropy b S X = - (∫x. Pxy x * log b (Px (fst x)) ∂(S ⨂🪙M T))"
    using b_gt_1
    apply (subst entropy_distr[OF Px Px_nn], simp)
    apply (subst distributed_transform_integral[OF Pxy _ Px, where T=fst])
    apply (auto intro!: integral_cong simp: space_pair_measure)
    done

  have Y: "entropy b T Y = - (∫x. Pxy x * log b (Py (snd x)) ∂(S ⨂🪙M T))"
    using b_gt_1
    apply (subst entropy_distr[OF Py Py_nn], simp)
    apply (subst distributed_transform_integral[OF Pxy _ Py, where T=snd])
    apply (auto intro!: integral_cong simp: space_pair_measure)
    done

  interpret S: sigma_finite_measure S by fact
  interpret T: sigma_finite_measure T by fact
  interpret ST: pair_sigma_finite S T ..
  have ST: "sigma_finite_measure (S ⨂🪙M T)" ..

  have XY: "entropy b (S ⨂🪙M T) (λx. (X x, Y x)) = - (∫x. Pxy x * log b (Pxy x) ∂(S ⨂🪙M T))"
    by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong simp: space_pair_measure)

  have "AE x in S ⨂🪙M T. Px (fst x) = 0 ⟶ Pxy x = 0"
    by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair simp: space_pair_measure)
  moreover have "AE x in S ⨂🪙M T. Py (snd x) = 0 ⟶ Pxy x = 0"
    by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair simp: space_pair_measure)
  moreover have "AE x in S ⨂🪙M T. 0 ≤ Px (fst x)"
    using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'')
  moreover have "AE x in S ⨂🪙M T. 0 ≤ Py (snd x)"
    using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'')
  ultimately have "AE x in S ⨂🪙M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) =
    Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))"
    (is "AE x in _. ?f x = ?g x")
    using AE_space
  proof eventually_elim
    case (elim x)
    show ?case
    proof cases
      assume "Pxy x ≠ 0"
      with elim have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x"
        by (auto simp: space_pair_measure less_le)
      then show ?thesis
        using b_gt_1 by (simp add: log_simps less_imp_le field_simps)
    qed simp
  qed

  have "entropy b S X + entropy b T Y - entropy b (S ⨂🪙M T) (λx. (X x, Y x)) = integral🪙L (S ⨂🪙M T) ?f"
    unfolding X Y XY
    apply (subst Bochner_Integration.integral_diff)
    apply (intro Bochner_Integration.integrable_diff Ixy Ix Iy)+
    apply (subst Bochner_Integration.integral_diff)
    apply (intro Ixy Ix Iy)+
    apply (simp add: field_simps)
    done
  also have "… = integral🪙L (S ⨂🪙M T) ?g"
    using ‹AE x in _. ?f x = ?g x› by (intro integral_cong_AE) auto
  also have "… = mutual_information b S T X Y"
    by (rule mutual_information_distr[OF S T Px _ Py _ Pxy _ , symmetric])
       (auto simp: space_pair_measure)
  finally show ?thesis ..
qed

lemma (in information_space) mutual_information_eq_entropy_conditional_entropy':
  fixes Px :: "'b ==> real" and Py :: "'c ==> real" and Pxy :: "('b × 'c) ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Px: "distributed M S X Px" "∧x. x ∈ space S ==> 0 ≤ Px x"
    and Py: "distributed M T Y Py" "∧x. x ∈ space T ==> 0 ≤ Py x"
  assumes Pxy: "distributed M (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
    "∧x. x ∈ space (S ⨂🪙M T) ==> 0 ≤ Pxy x"
  assumes Ix: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Px (fst x)))"
  assumes Iy: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Py (snd x)))"
  assumes Ixy: "integrable(S ⨂🪙M T) (λx. Pxy x * log b (Pxy x))"
  shows  "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y"
  using
    mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy]
    conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy]
  by (simp add: space_pair_measure)

lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
  assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  shows  "I(X ; Y) = H(X) - H(X | Y)"
proof -
  have X: "simple_distributed M X (λx. measure M (X -` {x} ∩ space M))"
    using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg)
  have Y: "simple_distributed M Y (λx. measure M (Y -` {x} ∩ space M))"
    using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg)
  have sf_XY: "simple_function M (λx. (X x, Y x))"
    using sf_X sf_Y by (rule simple_function_Pair)
  then have XY: "simple_distributed M (λx. (X x, Y x)) (λx. measure M ((λx. (X x, Y x)) -` {x} ∩ space M))"
    by (rule simple_distributedI) (auto simp: measure_nonneg)
  from simple_distributed_joint_finite[OF this, simp]
  have eq: "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) = count_space (X ` space M × Y ` space M)"
    by (simp add: pair_measure_count_space)

  have "I(X ; Y) = H(X) + H(Y) - entropy b (count_space (X`space M) ⨂🪙M count_space (Y`space M)) (λx. (X x, Y x))"
    using sigma_finite_measure_count_space_finite
      sigma_finite_measure_count_space_finite
      simple_distributed[OF X] measure_nonneg simple_distributed[OF Y] measure_nonneg simple_distributed_joint[OF XY]
    by (rule mutual_information_eq_entropy_conditional_entropy_distr)
       (auto simp: eq integrable_count_space measure_nonneg)
  then show ?thesis
    unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp
qed

lemma (in information_space) mutual_information_nonneg_simple:
  assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y"
  shows  "0 ≤ I(X ; Y)"
proof -
  have X: "simple_distributed M X (λx. measure M (X -` {x} ∩ space M))"
    using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg)
  have Y: "simple_distributed M Y (λx. measure M (Y -` {x} ∩ space M))"
    using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg)

  have sf_XY: "simple_function M (λx. (X x, Y x))"
    using sf_X sf_Y by (rule simple_function_Pair)
  then have XY: "simple_distributed M (λx. (X x, Y x)) (λx. measure M ((λx. (X x, Y x)) -` {x} ∩ space M))"
    by (rule simple_distributedI) (auto simp: measure_nonneg)

  from simple_distributed_joint_finite[OF this, simp]
  have eq: "count_space (X ` space M) ⨂🪙M count_space (Y ` space M) = count_space (X ` space M × Y ` space M)"
    by (simp add: pair_measure_count_space)

  show ?thesis
    by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]])
       (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite measure_nonneg)
qed

lemma (in information_space) conditional_entropy_less_eq_entropy:
  assumes X: "simple_function M X" and Z: "simple_function M Z"
  shows "H(X | Z) ≤ H(X)"
proof -
  have "0 ≤ I(X ; Z)" using X Z by (rule mutual_information_nonneg_simple)
  also have "I(X ; Z) = H(X) - H(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
  finally show ?thesis by auto
qed

lemma (in information_space)
  fixes Px :: "'b ==> real" and Py :: "'c ==> real" and Pxy :: "('b × 'c) ==> real"
  assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T"
  assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py"
  assumes Pxy: "finite_entropy (S ⨂🪙M T) (λx. (X x, Y x)) Pxy"
  shows "conditional_entropy b S T X Y ≤ entropy b S X"
proof -
  have "0 ≤ mutual_information b S T X Y"
    by (rule mutual_information_nonneg') fact+
  also have "… = entropy b S X - conditional_entropy b S T X Y"
  proof (intro mutual_information_eq_entropy_conditional_entropy')
    show "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Px (fst x)))"
         "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Py (snd x)))"
         "integrable (S ⨂🪙M T) (λx. Pxy x * log b (Pxy x))"
      using assms
      by (auto intro!: finite_entropy_integrable finite_entropy_distributed
          finite_entropy_integrable_transform[OF Px] finite_entropy_integrable_transform[OF Py]
          intro: finite_entropy_nn)
  qed (use assms Px Py Pxy finite_entropy_nn finite_entropy_distributed in auto)
  finally show ?thesis by auto
qed

lemma (in information_space) entropy_chain_rule:
  assumes X: "simple_function M X" and Y: "simple_function M Y"
  shows  "H(λx. (X x, Y x)) = H(X) + H(Y|X)"
proof -
  note XY = simple_distributedI[OF simple_function_Pair[OF X Y] measure_nonneg refl]
  note YX = simple_distributedI[OF simple_function_Pair[OF Y X] measure_nonneg refl]
  note simple_distributed_joint_finite[OF this, simp]
  let ?f = "λx. prob ((λx. (X x, Y x)) -` {x} ∩ space M)"
  let ?g = "λx. prob ((λx. (Y x, X x)) -` {x} ∩ space M)"
  let ?h = "λx. if x ∈ (λx. (Y x, X x)) ` space M then prob ((λx. (Y x, X x)) -` {x} ∩ space M) else 0"
  have "H(λx. (X x, Y x)) = - (∑x∈(λx. (X x, Y x)) ` space M. ?f x * log b (?f x))"
    using XY by (rule entropy_simple_distributed)
  also have "… = - (∑x∈(λ(x, y). (y, x)) ` (λx. (X x, Y x)) ` space M. ?g x * log b (?g x))"
    by (subst (2) sum.reindex) (auto simp: inj_on_def intro!: sum.cong arg_cong[where f="λA. prob A * log b (prob A)"])
  also have "… = - (∑x∈(λx. (Y x, X x)) ` space M. ?h x * log b (?h x))"
    by (auto intro!: sum.cong)
  also have "… = entropy b (count_space (Y ` space M) ⨂🪙M count_space (X ` space M)) (λx. (Y x, X x))"
    by (subst entropy_distr[OF simple_distributed_joint[OF YX]])
       (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite
             cong del: sum.cong_simp intro!: sum.mono_neutral_left measure_nonneg)
  finally have "H(λx. (X x, Y x)) = entropy b (count_space (Y ` space M) ⨂🪙M count_space (X ` space M)) (λx. (Y x, X x))" .
  then show ?thesis
    unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp
qed

lemma (in information_space) entropy_partition:
  assumes X: "simple_function M X"
  shows "H(X) = H(f ∘ X) + H(X|f ∘ X)"
proof -
  note fX = simple_function_compose[OF X, of f]
  have eq: "(λx. ((f ∘ X) x, X x)) ` space M = (λx. (f x, x)) ` X ` space M" by auto
  have inj: "∧A. inj_on (λx. (f x, x)) A"
    by (auto simp: inj_on_def)

  have "H(X) = - (∑x∈X ` space M. prob (X -` {x} ∩ space M) * log b (prob (X -` {x} ∩ space M)))"
    by (simp add: entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]])
  also have "… = - (∑x∈(λx. ((f ∘ X) x, X x)) ` space M.
                       prob ((λx. ((f ∘ X) x, X x)) -` {x} ∩ space M) *
                       log b (prob ((λx. ((f ∘ X) x, X x)) -` {x} ∩ space M)))"
    unfolding eq
    apply (subst sum.reindex[OF inj])
    apply (auto intro!: sum.cong arg_cong[where f="λA. prob A * log b (prob A)"])
    done
  also have "... = H(λx. ((f ∘ X) x, X x))"
    using entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] measure_nonneg refl]]
    by fastforce
  also have "… = H(f ∘ X) + H(X|f ∘ X)"
    using X entropy_chain_rule by blast
  finally show ?thesis .
qed

corollary (in information_space) entropy_data_processing:
  assumes "simple_function M X" shows "H(f ∘ X) ≤ H(X)"
  by (smt (verit) assms conditional_entropy_nonneg entropy_partition simple_function_compose)

corollary (in information_space) entropy_of_inj:
  assumes X: "simple_function M X" and inj: "inj_on f (X`space M)"
  shows "H(f ∘ X) = H(X)"
proof (rule antisym)
  show "H(f ∘ X) ≤ H(X)" using entropy_data_processing[OF X] .
next
  have sf: "simple_function M (f ∘ X)"
    using X by auto
  have "H(X) = H(the_inv_into (X`space M) f ∘ (f ∘ X))"
    unfolding o_assoc
    apply (subst entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]])
    apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="λx. prob (X -` {x} ∩ space M)"])
    apply (auto intro!: sum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def measure_nonneg)
    done
  also have "... ≤ H(f ∘ X)"
    using entropy_data_processing[OF sf] .
  finally show "H(X) ≤ H(f ∘ X)" .
qed

end

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.59 Sekunden (vorverarbeitet am 2026-04-27) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.